{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 5804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006891798759476223, "grad_norm": 1.1717361211776733, "learning_rate": 8.605851979345955e-11, "logits/chosen": -3.184086799621582, "logits/rejected": -3.1319174766540527, "logps/chosen": -49.95408630371094, "logps/rejected": -44.33523178100586, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.006891798759476223, "grad_norm": 1.0662882328033447, "learning_rate": 8.605851979345954e-10, "logits/chosen": -3.0800631046295166, "logits/rejected": -3.05961012840271, "logps/chosen": -54.045631408691406, "logps/rejected": -53.65376663208008, "loss": 0.6932, "rewards/accuracies": 0.4427083432674408, "rewards/chosen": 1.1835111763502937e-05, "rewards/margins": -7.020534394541755e-05, "rewards/rejected": 8.204047480830923e-05, "step": 10 }, { "epoch": 0.013783597518952447, "grad_norm": 1.169674277305603, "learning_rate": 1.7211703958691908e-09, "logits/chosen": -3.1165599822998047, "logits/rejected": -3.091635227203369, "logps/chosen": -55.89078903198242, "logps/rejected": -53.24898147583008, "loss": 0.6931, "rewards/accuracies": 0.47343748807907104, "rewards/chosen": 1.3864305401511956e-05, "rewards/margins": -9.870767598840757e-07, "rewards/rejected": 1.4851379091851413e-05, "step": 20 }, { "epoch": 0.02067539627842867, "grad_norm": 1.2944797277450562, "learning_rate": 2.5817555938037863e-09, "logits/chosen": -3.087826728820801, "logits/rejected": -3.0587966442108154, "logps/chosen": -54.553993225097656, "logps/rejected": -52.59638595581055, "loss": 0.6932, "rewards/accuracies": 0.4828124940395355, "rewards/chosen": -0.00010204836871707812, "rewards/margins": -7.52997730160132e-05, "rewards/rejected": -2.674859388207551e-05, "step": 30 }, { "epoch": 0.027567195037904894, "grad_norm": 1.1855700016021729, "learning_rate": 3.4423407917383816e-09, "logits/chosen": -3.0851259231567383, "logits/rejected": -3.0673484802246094, "logps/chosen": -53.86628341674805, "logps/rejected": -53.6602897644043, "loss": 0.6931, "rewards/accuracies": 0.4984374940395355, "rewards/chosen": 7.134721090551466e-05, "rewards/margins": 0.00011767115211114287, "rewards/rejected": -4.63239339296706e-05, "step": 40 }, { "epoch": 0.03445899379738112, "grad_norm": 1.2432990074157715, "learning_rate": 4.302925989672977e-09, "logits/chosen": -3.080653667449951, "logits/rejected": -3.056288957595825, "logps/chosen": -56.23808670043945, "logps/rejected": -53.091957092285156, "loss": 0.6931, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.00011039915261790156, "rewards/margins": 0.0001242260477738455, "rewards/rejected": -1.3826892427459825e-05, "step": 50 }, { "epoch": 0.04135079255685734, "grad_norm": 1.1337426900863647, "learning_rate": 5.163511187607573e-09, "logits/chosen": -3.0350937843322754, "logits/rejected": -3.009911298751831, "logps/chosen": -52.5786018371582, "logps/rejected": -52.6715087890625, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 3.223800376872532e-05, "rewards/margins": -1.786904249456711e-05, "rewards/rejected": 5.010703898733482e-05, "step": 60 }, { "epoch": 0.048242591316333565, "grad_norm": 1.2362148761749268, "learning_rate": 6.024096385542168e-09, "logits/chosen": -3.0923497676849365, "logits/rejected": -3.0710818767547607, "logps/chosen": -54.46764373779297, "logps/rejected": -53.85224533081055, "loss": 0.6932, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 2.844607843144331e-05, "rewards/margins": -3.54228941432666e-05, "rewards/rejected": 6.38689671177417e-05, "step": 70 }, { "epoch": 0.05513439007580979, "grad_norm": 1.1120648384094238, "learning_rate": 6.884681583476763e-09, "logits/chosen": -3.0346508026123047, "logits/rejected": -3.0209646224975586, "logps/chosen": -54.0633430480957, "logps/rejected": -52.68952560424805, "loss": 0.6932, "rewards/accuracies": 0.4828124940395355, "rewards/chosen": -1.7875881894724444e-05, "rewards/margins": -2.6181824068771675e-05, "rewards/rejected": 8.305940355057828e-06, "step": 80 }, { "epoch": 0.06202618883528601, "grad_norm": 1.1980597972869873, "learning_rate": 7.745266781411359e-09, "logits/chosen": -3.0487260818481445, "logits/rejected": -3.021777629852295, "logps/chosen": -54.59843063354492, "logps/rejected": -52.0598030090332, "loss": 0.6932, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -9.265693370252848e-05, "rewards/margins": -3.572347486624494e-05, "rewards/rejected": -5.693346611224115e-05, "step": 90 }, { "epoch": 0.06891798759476224, "grad_norm": 1.3346056938171387, "learning_rate": 8.605851979345954e-09, "logits/chosen": -3.119574785232544, "logits/rejected": -3.095754384994507, "logps/chosen": -53.645774841308594, "logps/rejected": -52.88176345825195, "loss": 0.6931, "rewards/accuracies": 0.5328124761581421, "rewards/chosen": 2.6289630113751628e-05, "rewards/margins": 7.76806118665263e-05, "rewards/rejected": -5.139098357176408e-05, "step": 100 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -3.1629772186279297, "eval_logits/rejected": -3.157360076904297, "eval_logps/chosen": -58.71721649169922, "eval_logps/rejected": -63.174400329589844, "eval_loss": 0.6932030320167542, "eval_rewards/accuracies": 0.47932156920433044, "eval_rewards/chosen": -5.322263314155862e-05, "eval_rewards/margins": -0.00011042186088161543, "eval_rewards/rejected": 5.719922774005681e-05, "eval_runtime": 382.606, "eval_samples_per_second": 11.249, "eval_steps_per_second": 1.406, "step": 100 }, { "epoch": 0.07580978635423846, "grad_norm": 1.2332243919372559, "learning_rate": 9.46643717728055e-09, "logits/chosen": -3.08892822265625, "logits/rejected": -3.0736162662506104, "logps/chosen": -53.078948974609375, "logps/rejected": -54.2159538269043, "loss": 0.6931, "rewards/accuracies": 0.5171874761581421, "rewards/chosen": -2.114586823154241e-05, "rewards/margins": 1.0802812539623119e-05, "rewards/rejected": -3.194867167621851e-05, "step": 110 }, { "epoch": 0.08270158511371468, "grad_norm": 1.2837666273117065, "learning_rate": 1.0327022375215145e-08, "logits/chosen": -3.043341875076294, "logits/rejected": -3.021089553833008, "logps/chosen": -54.95210647583008, "logps/rejected": -54.480567932128906, "loss": 0.6932, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": -1.86043562280247e-05, "rewards/margins": -8.284465729957446e-05, "rewards/rejected": 6.424028106266633e-05, "step": 120 }, { "epoch": 0.08959338387319091, "grad_norm": 1.1369248628616333, "learning_rate": 1.1187607573149742e-08, "logits/chosen": -3.010157346725464, "logits/rejected": -2.9788200855255127, "logps/chosen": -57.558807373046875, "logps/rejected": -51.64166259765625, "loss": 0.6931, "rewards/accuracies": 0.48906248807907104, "rewards/chosen": 3.0017108656466007e-05, "rewards/margins": 6.624778325203806e-05, "rewards/rejected": -3.623068187152967e-05, "step": 130 }, { "epoch": 0.09648518263266713, "grad_norm": 1.1683388948440552, "learning_rate": 1.2048192771084337e-08, "logits/chosen": -3.0672736167907715, "logits/rejected": -3.046203136444092, "logps/chosen": -53.5539665222168, "logps/rejected": -52.76469039916992, "loss": 0.6931, "rewards/accuracies": 0.5015624761581421, "rewards/chosen": 4.6087061491562054e-05, "rewards/margins": 0.00010134129843208939, "rewards/rejected": -5.5254222388612106e-05, "step": 140 }, { "epoch": 0.10337698139214335, "grad_norm": 1.2668614387512207, "learning_rate": 1.2908777969018933e-08, "logits/chosen": -3.043257474899292, "logits/rejected": -3.0280776023864746, "logps/chosen": -52.8119010925293, "logps/rejected": -54.632606506347656, "loss": 0.6932, "rewards/accuracies": 0.47968751192092896, "rewards/chosen": -3.67345564882271e-05, "rewards/margins": -6.840623882453656e-06, "rewards/rejected": -2.98939357890049e-05, "step": 150 }, { "epoch": 0.11026878015161957, "grad_norm": 1.210976243019104, "learning_rate": 1.3769363166953526e-08, "logits/chosen": -3.093601942062378, "logits/rejected": -3.076836585998535, "logps/chosen": -53.5924186706543, "logps/rejected": -52.919166564941406, "loss": 0.6932, "rewards/accuracies": 0.4703125059604645, "rewards/chosen": -2.290920565428678e-05, "rewards/margins": -0.00010185151040786877, "rewards/rejected": 7.894229929661378e-05, "step": 160 }, { "epoch": 0.1171605789110958, "grad_norm": 1.1882301568984985, "learning_rate": 1.4629948364888123e-08, "logits/chosen": -3.030766010284424, "logits/rejected": -3.022057294845581, "logps/chosen": -53.258087158203125, "logps/rejected": -53.862396240234375, "loss": 0.6932, "rewards/accuracies": 0.4859375059604645, "rewards/chosen": 2.349318310734816e-05, "rewards/margins": -2.9962649932713248e-05, "rewards/rejected": 5.345585668692365e-05, "step": 170 }, { "epoch": 0.12405237767057202, "grad_norm": 1.1534204483032227, "learning_rate": 1.5490533562822718e-08, "logits/chosen": -3.0753283500671387, "logits/rejected": -3.0525062084198, "logps/chosen": -55.68818283081055, "logps/rejected": -53.16033935546875, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 5.954229709459469e-05, "rewards/margins": 0.00011722864292096347, "rewards/rejected": -5.768631672253832e-05, "step": 180 }, { "epoch": 0.13094417643004824, "grad_norm": 1.1747231483459473, "learning_rate": 1.6351118760757314e-08, "logits/chosen": -3.1031861305236816, "logits/rejected": -3.073702335357666, "logps/chosen": -55.4235954284668, "logps/rejected": -52.44438934326172, "loss": 0.6931, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": 9.255047189071774e-06, "rewards/margins": 9.479814616497606e-05, "rewards/rejected": -8.554311352781951e-05, "step": 190 }, { "epoch": 0.13783597518952448, "grad_norm": 1.2227264642715454, "learning_rate": 1.7211703958691908e-08, "logits/chosen": -3.070514678955078, "logits/rejected": -3.0420584678649902, "logps/chosen": -53.74884033203125, "logps/rejected": -52.8384895324707, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -7.974383333930746e-05, "rewards/margins": -5.399574729381129e-05, "rewards/rejected": -2.5748076950549148e-05, "step": 200 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -3.1632611751556396, "eval_logits/rejected": -3.1576225757598877, "eval_logps/chosen": -58.70291519165039, "eval_logps/rejected": -63.17155456542969, "eval_loss": 0.693145751953125, "eval_rewards/accuracies": 0.49558550119400024, "eval_rewards/chosen": 8.983318548416719e-05, "eval_rewards/margins": 4.122095106140478e-06, "eval_rewards/rejected": 8.571110083721578e-05, "eval_runtime": 382.7399, "eval_samples_per_second": 11.245, "eval_steps_per_second": 1.406, "step": 200 }, { "epoch": 0.1447277739490007, "grad_norm": 1.14228355884552, "learning_rate": 1.8072289156626504e-08, "logits/chosen": -3.0896146297454834, "logits/rejected": -3.0656676292419434, "logps/chosen": -54.227699279785156, "logps/rejected": -52.46704864501953, "loss": 0.6931, "rewards/accuracies": 0.510937511920929, "rewards/chosen": -0.0001275625836569816, "rewards/margins": 2.1607429516734555e-05, "rewards/rejected": -0.00014916998043190688, "step": 210 }, { "epoch": 0.15161957270847692, "grad_norm": 1.1051299571990967, "learning_rate": 1.89328743545611e-08, "logits/chosen": -3.053691864013672, "logits/rejected": -3.0393364429473877, "logps/chosen": -51.6849250793457, "logps/rejected": -53.0732421875, "loss": 0.6932, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -6.968495290493593e-05, "rewards/margins": -3.246773485443555e-05, "rewards/rejected": -3.721721805050038e-05, "step": 220 }, { "epoch": 0.15851137146795313, "grad_norm": 1.2511394023895264, "learning_rate": 1.9793459552495694e-08, "logits/chosen": -3.071871280670166, "logits/rejected": -3.0470237731933594, "logps/chosen": -54.49553680419922, "logps/rejected": -52.023338317871094, "loss": 0.6931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -2.2832475224277005e-05, "rewards/margins": 6.969098467379808e-05, "rewards/rejected": -9.252345626009628e-05, "step": 230 }, { "epoch": 0.16540317022742937, "grad_norm": 1.141005039215088, "learning_rate": 2.065404475043029e-08, "logits/chosen": -3.0287063121795654, "logits/rejected": -3.0117838382720947, "logps/chosen": -54.675682067871094, "logps/rejected": -55.165077209472656, "loss": 0.6931, "rewards/accuracies": 0.5390625, "rewards/chosen": -2.109336492139846e-06, "rewards/margins": 0.00013083851081319153, "rewards/rejected": -0.000132947854581289, "step": 240 }, { "epoch": 0.17229496898690558, "grad_norm": 1.1711848974227905, "learning_rate": 2.1514629948364887e-08, "logits/chosen": -3.065638780593872, "logits/rejected": -3.0389957427978516, "logps/chosen": -57.044212341308594, "logps/rejected": -52.916351318359375, "loss": 0.6932, "rewards/accuracies": 0.4984374940395355, "rewards/chosen": -6.782300624763593e-05, "rewards/margins": -1.9592809621826746e-05, "rewards/rejected": -4.823019844479859e-05, "step": 250 }, { "epoch": 0.17918676774638181, "grad_norm": 1.149910807609558, "learning_rate": 2.2375215146299484e-08, "logits/chosen": -3.065356731414795, "logits/rejected": -3.049720525741577, "logps/chosen": -54.497764587402344, "logps/rejected": -54.91572189331055, "loss": 0.693, "rewards/accuracies": 0.5234375, "rewards/chosen": -3.3664375223452225e-05, "rewards/margins": 0.00020539489923976362, "rewards/rejected": -0.00023905928537715226, "step": 260 }, { "epoch": 0.18607856650585802, "grad_norm": 1.1320325136184692, "learning_rate": 2.3235800344234077e-08, "logits/chosen": -3.0667295455932617, "logits/rejected": -3.0387139320373535, "logps/chosen": -56.35082244873047, "logps/rejected": -52.4027099609375, "loss": 0.6931, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.00010486433893674985, "rewards/margins": 0.00010128649591933936, "rewards/rejected": -0.00020615084213204682, "step": 270 }, { "epoch": 0.19297036526533426, "grad_norm": 1.24606454372406, "learning_rate": 2.4096385542168673e-08, "logits/chosen": -3.056931257247925, "logits/rejected": -3.051713228225708, "logps/chosen": -53.108978271484375, "logps/rejected": -54.366554260253906, "loss": 0.6931, "rewards/accuracies": 0.5015624761581421, "rewards/chosen": -7.482778164558113e-05, "rewards/margins": -3.2262701097351965e-06, "rewards/rejected": -7.160151290008798e-05, "step": 280 }, { "epoch": 0.19986216402481047, "grad_norm": 1.1727635860443115, "learning_rate": 2.495697074010327e-08, "logits/chosen": -3.0859344005584717, "logits/rejected": -3.0643935203552246, "logps/chosen": -54.0856819152832, "logps/rejected": -54.075660705566406, "loss": 0.6931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.00011745165102183819, "rewards/margins": 0.00011225246998947114, "rewards/rejected": -0.00022970412101130933, "step": 290 }, { "epoch": 0.2067539627842867, "grad_norm": 1.116284728050232, "learning_rate": 2.5817555938037866e-08, "logits/chosen": -3.0129384994506836, "logits/rejected": -2.9892992973327637, "logps/chosen": -54.18927764892578, "logps/rejected": -51.860084533691406, "loss": 0.693, "rewards/accuracies": 0.5453125238418579, "rewards/chosen": -3.015759284608066e-05, "rewards/margins": 0.00020299448806326836, "rewards/rejected": -0.00023315209546126425, "step": 300 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -3.1631648540496826, "eval_logits/rejected": -3.1575493812561035, "eval_logps/chosen": -58.70024108886719, "eval_logps/rejected": -63.15769577026367, "eval_loss": 0.6932016015052795, "eval_rewards/accuracies": 0.4723513126373291, "eval_rewards/chosen": 0.00011654118134174496, "eval_rewards/margins": -0.00010772919631563127, "eval_rewards/rejected": 0.000224270363105461, "eval_runtime": 382.6201, "eval_samples_per_second": 11.249, "eval_steps_per_second": 1.406, "step": 300 }, { "epoch": 0.2136457615437629, "grad_norm": 1.191618800163269, "learning_rate": 2.667814113597246e-08, "logits/chosen": -3.087235927581787, "logits/rejected": -3.057124376296997, "logps/chosen": -53.848426818847656, "logps/rejected": -52.60887908935547, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -9.047386993188411e-05, "rewards/margins": 0.00013814482372254133, "rewards/rejected": -0.00022861870820634067, "step": 310 }, { "epoch": 0.22053756030323915, "grad_norm": 1.1519832611083984, "learning_rate": 2.7538726333907053e-08, "logits/chosen": -3.076031446456909, "logits/rejected": -3.0592520236968994, "logps/chosen": -52.461647033691406, "logps/rejected": -54.05097198486328, "loss": 0.6932, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.00029962151893414557, "rewards/margins": -1.4733945135958493e-05, "rewards/rejected": -0.00028488755924627185, "step": 320 }, { "epoch": 0.22742935906271536, "grad_norm": 1.1422587633132935, "learning_rate": 2.8399311531841653e-08, "logits/chosen": -3.07609224319458, "logits/rejected": -3.049673080444336, "logps/chosen": -53.41454315185547, "logps/rejected": -51.115447998046875, "loss": 0.693, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.00020497883087955415, "rewards/margins": 0.0002036425721598789, "rewards/rejected": -0.0004086214175913483, "step": 330 }, { "epoch": 0.2343211578221916, "grad_norm": 1.354000449180603, "learning_rate": 2.9259896729776246e-08, "logits/chosen": -3.058897018432617, "logits/rejected": -3.0365147590637207, "logps/chosen": -54.9308967590332, "logps/rejected": -53.086883544921875, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00015698981587775052, "rewards/margins": 4.799760790774599e-05, "rewards/rejected": -0.00020498744561336935, "step": 340 }, { "epoch": 0.2412129565816678, "grad_norm": 1.2231907844543457, "learning_rate": 3.012048192771084e-08, "logits/chosen": -3.083113193511963, "logits/rejected": -3.0603020191192627, "logps/chosen": -54.93220138549805, "logps/rejected": -52.8737678527832, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.6437613491434604e-05, "rewards/margins": 0.00027541659073904157, "rewards/rejected": -0.000321854226058349, "step": 350 }, { "epoch": 0.24810475534114404, "grad_norm": 1.1050974130630493, "learning_rate": 3.0981067125645436e-08, "logits/chosen": -3.0406999588012695, "logits/rejected": -3.0284476280212402, "logps/chosen": -52.319671630859375, "logps/rejected": -53.22536087036133, "loss": 0.6931, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.0002593585813883692, "rewards/margins": 0.00014941213885322213, "rewards/rejected": -0.0004087706911377609, "step": 360 }, { "epoch": 0.2549965541006203, "grad_norm": 1.1197164058685303, "learning_rate": 3.184165232358003e-08, "logits/chosen": -3.1160285472869873, "logits/rejected": -3.080443859100342, "logps/chosen": -53.897254943847656, "logps/rejected": -52.24829864501953, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.00019978820637334138, "rewards/margins": 0.0002781632065307349, "rewards/rejected": -0.0004779514274559915, "step": 370 }, { "epoch": 0.2618883528600965, "grad_norm": 1.135910153388977, "learning_rate": 3.270223752151463e-08, "logits/chosen": -3.041220188140869, "logits/rejected": -3.0233259201049805, "logps/chosen": -54.58507537841797, "logps/rejected": -52.707801818847656, "loss": 0.6931, "rewards/accuracies": 0.5328124761581421, "rewards/chosen": -0.0002783517411444336, "rewards/margins": 0.00014765023661311716, "rewards/rejected": -0.000426001992309466, "step": 380 }, { "epoch": 0.2687801516195727, "grad_norm": 1.1035844087600708, "learning_rate": 3.356282271944922e-08, "logits/chosen": -3.024963617324829, "logits/rejected": -3.0033364295959473, "logps/chosen": -52.98070526123047, "logps/rejected": -50.801490783691406, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0002790427242871374, "rewards/margins": 0.0003396864631213248, "rewards/rejected": -0.0006187291583046317, "step": 390 }, { "epoch": 0.27567195037904896, "grad_norm": 1.1149168014526367, "learning_rate": 3.4423407917383815e-08, "logits/chosen": -3.052011251449585, "logits/rejected": -3.0366580486297607, "logps/chosen": -52.822052001953125, "logps/rejected": -53.987640380859375, "loss": 0.693, "rewards/accuracies": 0.5171874761581421, "rewards/chosen": -0.00036801007809117436, "rewards/margins": 0.00022597968927584589, "rewards/rejected": -0.0005939897382631898, "step": 400 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -3.1625325679779053, "eval_logits/rejected": -3.1568939685821533, "eval_logps/chosen": -58.6827278137207, "eval_logps/rejected": -63.15472412109375, "eval_loss": 0.6931291222572327, "eval_rewards/accuracies": 0.5006970167160034, "eval_rewards/chosen": 0.00029171525966376066, "eval_rewards/margins": 3.7770030758110806e-05, "eval_rewards/rejected": 0.0002539452980272472, "eval_runtime": 382.4785, "eval_samples_per_second": 11.253, "eval_steps_per_second": 1.407, "step": 400 }, { "epoch": 0.28256374913852517, "grad_norm": 1.253456950187683, "learning_rate": 3.5283993115318415e-08, "logits/chosen": -3.079643726348877, "logits/rejected": -3.0518524646759033, "logps/chosen": -54.9720458984375, "logps/rejected": -53.8119010925293, "loss": 0.6927, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.00022477764287032187, "rewards/margins": 0.0008114264346659184, "rewards/rejected": -0.001036203932017088, "step": 410 }, { "epoch": 0.2894555478980014, "grad_norm": 1.2392370700836182, "learning_rate": 3.614457831325301e-08, "logits/chosen": -3.1075050830841064, "logits/rejected": -3.0766782760620117, "logps/chosen": -55.606849670410156, "logps/rejected": -53.67010498046875, "loss": 0.6928, "rewards/accuracies": 0.5921875238418579, "rewards/chosen": -0.00024112523533403873, "rewards/margins": 0.00067254772875458, "rewards/rejected": -0.0009136729058809578, "step": 420 }, { "epoch": 0.2963473466574776, "grad_norm": 1.1693394184112549, "learning_rate": 3.70051635111876e-08, "logits/chosen": -3.0580883026123047, "logits/rejected": -3.0378589630126953, "logps/chosen": -55.03847122192383, "logps/rejected": -53.360572814941406, "loss": 0.693, "rewards/accuracies": 0.542187511920929, "rewards/chosen": -0.0005246425280347466, "rewards/margins": 0.00026738576707430184, "rewards/rejected": -0.0007920282077975571, "step": 430 }, { "epoch": 0.30323914541695385, "grad_norm": 1.195412278175354, "learning_rate": 3.78657487091222e-08, "logits/chosen": -3.0845541954040527, "logits/rejected": -3.0656802654266357, "logps/chosen": -54.361412048339844, "logps/rejected": -53.98042678833008, "loss": 0.6929, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.0005021851393394172, "rewards/margins": 0.0004464749654289335, "rewards/rejected": -0.0009486600756645203, "step": 440 }, { "epoch": 0.31013094417643006, "grad_norm": 1.2172218561172485, "learning_rate": 3.8726333907056795e-08, "logits/chosen": -3.112600088119507, "logits/rejected": -3.0922322273254395, "logps/chosen": -53.38554000854492, "logps/rejected": -53.096343994140625, "loss": 0.6929, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.0006109057576395571, "rewards/margins": 0.0005376356421038508, "rewards/rejected": -0.0011485414579510689, "step": 450 }, { "epoch": 0.31702274293590627, "grad_norm": 1.1300045251846313, "learning_rate": 3.958691910499139e-08, "logits/chosen": -3.1189820766448975, "logits/rejected": -3.0968446731567383, "logps/chosen": -55.58698654174805, "logps/rejected": -53.63945770263672, "loss": 0.6929, "rewards/accuracies": 0.5453125238418579, "rewards/chosen": -0.0005132319638505578, "rewards/margins": 0.0005019743111915886, "rewards/rejected": -0.0010152062168344855, "step": 460 }, { "epoch": 0.3239145416953825, "grad_norm": 1.2021775245666504, "learning_rate": 4.044750430292599e-08, "logits/chosen": -3.066100835800171, "logits/rejected": -3.0411181449890137, "logps/chosen": -54.836082458496094, "logps/rejected": -53.570350646972656, "loss": 0.6926, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.0004428374522831291, "rewards/margins": 0.0010055055608972907, "rewards/rejected": -0.001448343158699572, "step": 470 }, { "epoch": 0.33080634045485874, "grad_norm": 1.2285805940628052, "learning_rate": 4.130808950086058e-08, "logits/chosen": -3.0582435131073, "logits/rejected": -3.032745361328125, "logps/chosen": -55.137550354003906, "logps/rejected": -53.5596923828125, "loss": 0.6928, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.0005406636046245694, "rewards/margins": 0.0007832925766706467, "rewards/rejected": -0.001323956297710538, "step": 480 }, { "epoch": 0.33769813921433495, "grad_norm": 1.1076241731643677, "learning_rate": 4.216867469879518e-08, "logits/chosen": -3.047879695892334, "logits/rejected": -3.0287106037139893, "logps/chosen": -52.995025634765625, "logps/rejected": -53.45074462890625, "loss": 0.6927, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0007063150987960398, "rewards/margins": 0.0009296282078139484, "rewards/rejected": -0.0016359431901946664, "step": 490 }, { "epoch": 0.34458993797381116, "grad_norm": 1.24040687084198, "learning_rate": 4.3029259896729774e-08, "logits/chosen": -3.0817952156066895, "logits/rejected": -3.0658888816833496, "logps/chosen": -53.2136116027832, "logps/rejected": -53.08428955078125, "loss": 0.6927, "rewards/accuracies": 0.557812511920929, "rewards/chosen": -0.0008117581601254642, "rewards/margins": 0.000863810651935637, "rewards/rejected": -0.0016755687538534403, "step": 500 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -3.1619200706481934, "eval_logits/rejected": -3.156308650970459, "eval_logps/chosen": -58.651756286621094, "eval_logps/rejected": -63.135921478271484, "eval_loss": 0.6930687427520752, "eval_rewards/accuracies": 0.5127788186073303, "eval_rewards/chosen": 0.0006014005630277097, "eval_rewards/margins": 0.00015941854508128017, "eval_rewards/rejected": 0.0004419820907060057, "eval_runtime": 382.31, "eval_samples_per_second": 11.258, "eval_steps_per_second": 1.407, "step": 500 }, { "epoch": 0.35148173673328736, "grad_norm": 1.1800665855407715, "learning_rate": 4.388984509466437e-08, "logits/chosen": -3.070990562438965, "logits/rejected": -3.049795150756836, "logps/chosen": -53.887001037597656, "logps/rejected": -54.08222579956055, "loss": 0.6927, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0008714848081581295, "rewards/margins": 0.0009107475052587688, "rewards/rejected": -0.001782232546247542, "step": 510 }, { "epoch": 0.35837353549276363, "grad_norm": 1.1369861364364624, "learning_rate": 4.475043029259897e-08, "logits/chosen": -3.112302303314209, "logits/rejected": -3.0830445289611816, "logps/chosen": -53.50897216796875, "logps/rejected": -53.0218620300293, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0008085424196906388, "rewards/margins": 0.0008950178744271398, "rewards/rejected": -0.0017035603523254395, "step": 520 }, { "epoch": 0.36526533425223984, "grad_norm": 1.205866813659668, "learning_rate": 4.561101549053356e-08, "logits/chosen": -3.054050922393799, "logits/rejected": -3.041537046432495, "logps/chosen": -53.141502380371094, "logps/rejected": -53.188323974609375, "loss": 0.6927, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.000873840763233602, "rewards/margins": 0.0009431767975911498, "rewards/rejected": -0.0018170175608247519, "step": 530 }, { "epoch": 0.37215713301171605, "grad_norm": 1.1606783866882324, "learning_rate": 4.6471600688468154e-08, "logits/chosen": -3.0061757564544678, "logits/rejected": -2.9840445518493652, "logps/chosen": -53.2663459777832, "logps/rejected": -51.14638137817383, "loss": 0.6926, "rewards/accuracies": 0.589062511920929, "rewards/chosen": -0.0012356654042378068, "rewards/margins": 0.001098443171940744, "rewards/rejected": -0.0023341085761785507, "step": 540 }, { "epoch": 0.37904893177119225, "grad_norm": 1.2817575931549072, "learning_rate": 4.7332185886402753e-08, "logits/chosen": -3.151078939437866, "logits/rejected": -3.13012433052063, "logps/chosen": -54.49916458129883, "logps/rejected": -53.452171325683594, "loss": 0.6926, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.0012268421705812216, "rewards/margins": 0.0011314301518723369, "rewards/rejected": -0.0023582722060382366, "step": 550 }, { "epoch": 0.3859407305306685, "grad_norm": 1.3084540367126465, "learning_rate": 4.8192771084337347e-08, "logits/chosen": -3.0619988441467285, "logits/rejected": -3.0388872623443604, "logps/chosen": -54.879478454589844, "logps/rejected": -53.23296356201172, "loss": 0.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0010658774990588427, "rewards/margins": 0.0015331886243075132, "rewards/rejected": -0.002599066123366356, "step": 560 }, { "epoch": 0.3928325292901447, "grad_norm": 1.147674798965454, "learning_rate": 4.905335628227194e-08, "logits/chosen": -3.0011484622955322, "logits/rejected": -2.9750239849090576, "logps/chosen": -56.04889678955078, "logps/rejected": -54.77867889404297, "loss": 0.6924, "rewards/accuracies": 0.582812488079071, "rewards/chosen": -0.0013328868662938476, "rewards/margins": 0.0015611122362315655, "rewards/rejected": -0.002893999218940735, "step": 570 }, { "epoch": 0.39972432804962094, "grad_norm": 1.2272933721542358, "learning_rate": 4.991394148020654e-08, "logits/chosen": -3.051095485687256, "logits/rejected": -3.0210120677948, "logps/chosen": -54.47142791748047, "logps/rejected": -52.838462829589844, "loss": 0.6921, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0013455904554575682, "rewards/margins": 0.0020441694650799036, "rewards/rejected": -0.003389759687706828, "step": 580 }, { "epoch": 0.4066161268090972, "grad_norm": 1.1574183702468872, "learning_rate": 4.9999633685875244e-08, "logits/chosen": -3.060479164123535, "logits/rejected": -3.0428626537323, "logps/chosen": -54.345176696777344, "logps/rejected": -53.871498107910156, "loss": 0.6922, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0016159784281626344, "rewards/margins": 0.0019019825849682093, "rewards/rejected": -0.003517960663884878, "step": 590 }, { "epoch": 0.4135079255685734, "grad_norm": 1.0794343948364258, "learning_rate": 4.9998367428608654e-08, "logits/chosen": -3.092864513397217, "logits/rejected": -3.0729570388793945, "logps/chosen": -54.35276412963867, "logps/rejected": -53.6497917175293, "loss": 0.6922, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.001930914237163961, "rewards/margins": 0.0018496786942705512, "rewards/rejected": -0.003780592931434512, "step": 600 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -3.1600301265716553, "eval_logits/rejected": -3.154381513595581, "eval_logps/chosen": -58.62492752075195, "eval_logps/rejected": -63.12952423095703, "eval_loss": 0.6929681301116943, "eval_rewards/accuracies": 0.535780668258667, "eval_rewards/chosen": 0.0008697069133631885, "eval_rewards/margins": 0.00036371115129441023, "eval_rewards/rejected": 0.0005059958784841001, "eval_runtime": 382.6387, "eval_samples_per_second": 11.248, "eval_steps_per_second": 1.406, "step": 600 }, { "epoch": 0.4203997243280496, "grad_norm": 1.1758496761322021, "learning_rate": 4.999619675160485e-08, "logits/chosen": -3.1056699752807617, "logits/rejected": -3.0810115337371826, "logps/chosen": -55.1336555480957, "logps/rejected": -52.96571731567383, "loss": 0.6921, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.0015057630371302366, "rewards/margins": 0.002140932949259877, "rewards/rejected": -0.003646695986390114, "step": 610 }, { "epoch": 0.4272915230875258, "grad_norm": 1.1999083757400513, "learning_rate": 4.999312173339707e-08, "logits/chosen": -3.081458568572998, "logits/rejected": -3.054145336151123, "logps/chosen": -54.88127899169922, "logps/rejected": -52.38240432739258, "loss": 0.6923, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.0022574197500944138, "rewards/margins": 0.0017544630682095885, "rewards/rejected": -0.004011882934719324, "step": 620 }, { "epoch": 0.4341833218470021, "grad_norm": 1.1177016496658325, "learning_rate": 4.998914248523688e-08, "logits/chosen": -3.044602870941162, "logits/rejected": -3.0268893241882324, "logps/chosen": -53.19904327392578, "logps/rejected": -52.6724853515625, "loss": 0.6925, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.0025318977423012257, "rewards/margins": 0.0013229602482169867, "rewards/rejected": -0.003854858223348856, "step": 630 }, { "epoch": 0.4410751206064783, "grad_norm": 1.2482026815414429, "learning_rate": 4.998425915109009e-08, "logits/chosen": -3.094036817550659, "logits/rejected": -3.0758235454559326, "logps/chosen": -54.895851135253906, "logps/rejected": -54.4415283203125, "loss": 0.6921, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.002324806060642004, "rewards/margins": 0.0021106041967868805, "rewards/rejected": -0.0044354102574288845, "step": 640 }, { "epoch": 0.4479669193659545, "grad_norm": 1.2178966999053955, "learning_rate": 4.9978471907631604e-08, "logits/chosen": -3.0776593685150146, "logits/rejected": -3.0467400550842285, "logps/chosen": -56.6843147277832, "logps/rejected": -52.8115234375, "loss": 0.692, "rewards/accuracies": 0.59375, "rewards/chosen": -0.002397857140749693, "rewards/margins": 0.0022247000597417355, "rewards/rejected": -0.004622557200491428, "step": 650 }, { "epoch": 0.4548587181254307, "grad_norm": 1.1715619564056396, "learning_rate": 4.9971780964238976e-08, "logits/chosen": -3.0463173389434814, "logits/rejected": -3.015166759490967, "logps/chosen": -53.899269104003906, "logps/rejected": -51.60932159423828, "loss": 0.6917, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.0026077921502292156, "rewards/margins": 0.0028277833480387926, "rewards/rejected": -0.005435575731098652, "step": 660 }, { "epoch": 0.461750516884907, "grad_norm": 1.2758429050445557, "learning_rate": 4.996418656298486e-08, "logits/chosen": -3.018087863922119, "logits/rejected": -2.9889063835144043, "logps/chosen": -55.94907760620117, "logps/rejected": -53.60669708251953, "loss": 0.6918, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.002205336932092905, "rewards/margins": 0.0026536956429481506, "rewards/rejected": -0.004859032575041056, "step": 670 }, { "epoch": 0.4686423156443832, "grad_norm": 1.1543318033218384, "learning_rate": 4.995568897862825e-08, "logits/chosen": -3.050879955291748, "logits/rejected": -3.034536600112915, "logps/chosen": -54.130043029785156, "logps/rejected": -53.64136505126953, "loss": 0.692, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0028859872836619616, "rewards/margins": 0.002336550736799836, "rewards/rejected": -0.005222538020461798, "step": 680 }, { "epoch": 0.4755341144038594, "grad_norm": 1.1920276880264282, "learning_rate": 4.994628851860456e-08, "logits/chosen": -3.032226085662842, "logits/rejected": -3.0160422325134277, "logps/chosen": -54.02294158935547, "logps/rejected": -53.747314453125, "loss": 0.6919, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.002817694563418627, "rewards/margins": 0.0025537677574902773, "rewards/rejected": -0.005371461622416973, "step": 690 }, { "epoch": 0.4824259131633356, "grad_norm": 1.2519989013671875, "learning_rate": 4.993598552301446e-08, "logits/chosen": -3.0840821266174316, "logits/rejected": -3.054109573364258, "logps/chosen": -55.22893524169922, "logps/rejected": -53.15040969848633, "loss": 0.692, "rewards/accuracies": 0.5953124761581421, "rewards/chosen": -0.002690562279894948, "rewards/margins": 0.002303097629919648, "rewards/rejected": -0.004993659444153309, "step": 700 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -3.1578054428100586, "eval_logits/rejected": -3.1521644592285156, "eval_logps/chosen": -58.56085205078125, "eval_logps/rejected": -63.09734344482422, "eval_loss": 0.692812442779541, "eval_rewards/accuracies": 0.5515799522399902, "eval_rewards/chosen": 0.0015104113845154643, "eval_rewards/margins": 0.0006825894815847278, "eval_rewards/rejected": 0.0008278219029307365, "eval_runtime": 382.6608, "eval_samples_per_second": 11.248, "eval_steps_per_second": 1.406, "step": 700 }, { "epoch": 0.48931771192281187, "grad_norm": 1.1324511766433716, "learning_rate": 4.9924780364611574e-08, "logits/chosen": -3.0304083824157715, "logits/rejected": -3.01446270942688, "logps/chosen": -54.75562286376953, "logps/rejected": -54.3417854309082, "loss": 0.6919, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.003056741552427411, "rewards/margins": 0.0025413241237401962, "rewards/rejected": -0.005598065908998251, "step": 710 }, { "epoch": 0.4962095106822881, "grad_norm": 1.164832353591919, "learning_rate": 4.9912673448789055e-08, "logits/chosen": -3.0854406356811523, "logits/rejected": -3.0618083477020264, "logps/chosen": -54.96254348754883, "logps/rejected": -53.392417907714844, "loss": 0.6912, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0029385548550635576, "rewards/margins": 0.004011799581348896, "rewards/rejected": -0.006950353737920523, "step": 720 }, { "epoch": 0.5031013094417643, "grad_norm": 1.2162679433822632, "learning_rate": 4.989966521356484e-08, "logits/chosen": -3.112225294113159, "logits/rejected": -3.104539155960083, "logps/chosen": -54.104270935058594, "logps/rejected": -54.43672561645508, "loss": 0.6919, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.003235139651224017, "rewards/margins": 0.0024808451998978853, "rewards/rejected": -0.0057159848511219025, "step": 730 }, { "epoch": 0.5099931082012406, "grad_norm": 1.2919895648956299, "learning_rate": 4.9885756129565855e-08, "logits/chosen": -3.0732569694519043, "logits/rejected": -3.045283555984497, "logps/chosen": -53.6169319152832, "logps/rejected": -52.88599395751953, "loss": 0.6915, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0032176957465708256, "rewards/margins": 0.0034366168547421694, "rewards/rejected": -0.006654313299804926, "step": 740 }, { "epoch": 0.5168849069607168, "grad_norm": 1.1294078826904297, "learning_rate": 4.9870946700010963e-08, "logits/chosen": -3.0247771739959717, "logits/rejected": -3.002697467803955, "logps/chosen": -54.74202346801758, "logps/rejected": -53.9817008972168, "loss": 0.6912, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.002922980347648263, "rewards/margins": 0.003884477075189352, "rewards/rejected": -0.006807458586990833, "step": 750 }, { "epoch": 0.523776705720193, "grad_norm": 1.1451152563095093, "learning_rate": 4.985523746069277e-08, "logits/chosen": -3.123654365539551, "logits/rejected": -3.102745532989502, "logps/chosen": -54.486412048339844, "logps/rejected": -53.88116455078125, "loss": 0.6916, "rewards/accuracies": 0.582812488079071, "rewards/chosen": -0.004141902085393667, "rewards/margins": 0.003131680190563202, "rewards/rejected": -0.007273583207279444, "step": 760 }, { "epoch": 0.5306685044796692, "grad_norm": 1.1648293733596802, "learning_rate": 4.9838628979958226e-08, "logits/chosen": -3.0169625282287598, "logits/rejected": -2.998164653778076, "logps/chosen": -54.984519958496094, "logps/rejected": -55.334449768066406, "loss": 0.692, "rewards/accuracies": 0.582812488079071, "rewards/chosen": -0.004757951479405165, "rewards/margins": 0.0024256636388599873, "rewards/rejected": -0.007183616049587727, "step": 770 }, { "epoch": 0.5375603032391454, "grad_norm": 1.2733426094055176, "learning_rate": 4.982112185868809e-08, "logits/chosen": -3.0358846187591553, "logits/rejected": -3.020009756088257, "logps/chosen": -53.98040008544922, "logps/rejected": -54.75188446044922, "loss": 0.6917, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.0044146752916276455, "rewards/margins": 0.0030339283403009176, "rewards/rejected": -0.007448603864759207, "step": 780 }, { "epoch": 0.5444521019986216, "grad_norm": 1.1421672105789185, "learning_rate": 4.980271673027517e-08, "logits/chosen": -3.043078899383545, "logits/rejected": -3.0224287509918213, "logps/chosen": -53.47418975830078, "logps/rejected": -54.137420654296875, "loss": 0.6911, "rewards/accuracies": 0.5953124761581421, "rewards/chosen": -0.004008511081337929, "rewards/margins": 0.004073344171047211, "rewards/rejected": -0.00808185525238514, "step": 790 }, { "epoch": 0.5513439007580979, "grad_norm": 1.1940661668777466, "learning_rate": 4.9783414260601395e-08, "logits/chosen": -3.0741262435913086, "logits/rejected": -3.0514583587646484, "logps/chosen": -54.605628967285156, "logps/rejected": -53.26616287231445, "loss": 0.6911, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.004125869832932949, "rewards/margins": 0.004256745334714651, "rewards/rejected": -0.008382615633308887, "step": 800 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -3.155327558517456, "eval_logits/rejected": -3.149695634841919, "eval_logps/chosen": -58.53166580200195, "eval_logps/rejected": -63.11719512939453, "eval_loss": 0.6925740838050842, "eval_rewards/accuracies": 0.5634293556213379, "eval_rewards/chosen": 0.001802256447263062, "eval_rewards/margins": 0.0011729516554623842, "eval_rewards/rejected": 0.0006293050828389823, "eval_runtime": 382.7077, "eval_samples_per_second": 11.246, "eval_steps_per_second": 1.406, "step": 800 }, { "epoch": 0.5582356995175741, "grad_norm": 1.173479437828064, "learning_rate": 4.976321514801376e-08, "logits/chosen": -3.0321974754333496, "logits/rejected": -3.0146262645721436, "logps/chosen": -54.1708869934082, "logps/rejected": -54.401588439941406, "loss": 0.6916, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.004985693376511335, "rewards/margins": 0.003273946000263095, "rewards/rejected": -0.008259640075266361, "step": 810 }, { "epoch": 0.5651274982770503, "grad_norm": 1.1598631143569946, "learning_rate": 4.974212012329902e-08, "logits/chosen": -3.0929911136627197, "logits/rejected": -3.0796961784362793, "logps/chosen": -52.722068786621094, "logps/rejected": -55.03529739379883, "loss": 0.6912, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.005575023125857115, "rewards/margins": 0.003944667521864176, "rewards/rejected": -0.00951969064772129, "step": 820 }, { "epoch": 0.5720192970365265, "grad_norm": 1.2282392978668213, "learning_rate": 4.97201299496573e-08, "logits/chosen": -3.068333148956299, "logits/rejected": -3.0388402938842773, "logps/chosen": -54.16389846801758, "logps/rejected": -52.590980529785156, "loss": 0.6906, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.004416106268763542, "rewards/margins": 0.005132976919412613, "rewards/rejected": -0.009549083188176155, "step": 830 }, { "epoch": 0.5789110957960028, "grad_norm": 1.15338134765625, "learning_rate": 4.969724542267442e-08, "logits/chosen": -3.0817673206329346, "logits/rejected": -3.0567972660064697, "logps/chosen": -55.625160217285156, "logps/rejected": -55.353843688964844, "loss": 0.6902, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.004450449254363775, "rewards/margins": 0.0060601914301514626, "rewards/rejected": -0.01051064021885395, "step": 840 }, { "epoch": 0.585802894555479, "grad_norm": 1.1694824695587158, "learning_rate": 4.967346737029316e-08, "logits/chosen": -3.084930896759033, "logits/rejected": -3.057694435119629, "logps/chosen": -53.77745819091797, "logps/rejected": -52.777015686035156, "loss": 0.6907, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.005273155402392149, "rewards/margins": 0.0050581167452037334, "rewards/rejected": -0.010331272147595882, "step": 850 }, { "epoch": 0.5926946933149552, "grad_norm": 1.224703073501587, "learning_rate": 4.964879665278331e-08, "logits/chosen": -3.0363848209381104, "logits/rejected": -3.013810634613037, "logps/chosen": -52.17908477783203, "logps/rejected": -53.07768630981445, "loss": 0.6906, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.006155041512101889, "rewards/margins": 0.005122465081512928, "rewards/rejected": -0.011277507059276104, "step": 860 }, { "epoch": 0.5995864920744314, "grad_norm": 1.2663919925689697, "learning_rate": 4.9623234162710505e-08, "logits/chosen": -3.1284661293029785, "logits/rejected": -3.10298490524292, "logps/chosen": -56.10413360595703, "logps/rejected": -54.723472595214844, "loss": 0.6905, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.004832122474908829, "rewards/margins": 0.005487448535859585, "rewards/rejected": -0.010319570079445839, "step": 870 }, { "epoch": 0.6064782908339077, "grad_norm": 1.2287334203720093, "learning_rate": 4.959678082490396e-08, "logits/chosen": -3.0056588649749756, "logits/rejected": -2.9849095344543457, "logps/chosen": -54.50261688232422, "logps/rejected": -54.28424835205078, "loss": 0.6908, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.007092119660228491, "rewards/margins": 0.00476409774273634, "rewards/rejected": -0.011856217868626118, "step": 880 }, { "epoch": 0.6133700895933839, "grad_norm": 1.2229245901107788, "learning_rate": 4.9569437596423006e-08, "logits/chosen": -3.051978826522827, "logits/rejected": -3.0254709720611572, "logps/chosen": -54.968544006347656, "logps/rejected": -53.611793518066406, "loss": 0.6903, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.006352136842906475, "rewards/margins": 0.005814626347273588, "rewards/rejected": -0.012166764587163925, "step": 890 }, { "epoch": 0.6202618883528601, "grad_norm": 1.1797666549682617, "learning_rate": 4.954120546652246e-08, "logits/chosen": -3.0876083374023438, "logits/rejected": -3.062818765640259, "logps/chosen": -56.6866340637207, "logps/rejected": -55.2923698425293, "loss": 0.6903, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.006517441011965275, "rewards/margins": 0.005804947577416897, "rewards/rejected": -0.012322388589382172, "step": 900 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -3.151287794113159, "eval_logits/rejected": -3.145599365234375, "eval_logps/chosen": -58.52416229248047, "eval_logps/rejected": -63.163387298583984, "eval_loss": 0.6923166513442993, "eval_rewards/accuracies": 0.5641263723373413, "eval_rewards/chosen": 0.0018773017218336463, "eval_rewards/margins": 0.0017099250108003616, "eval_rewards/rejected": 0.0001673765800660476, "eval_runtime": 382.7494, "eval_samples_per_second": 11.245, "eval_steps_per_second": 1.406, "step": 900 }, { "epoch": 0.6271536871123363, "grad_norm": 1.2264323234558105, "learning_rate": 4.9512085456616845e-08, "logits/chosen": -3.0514819622039795, "logits/rejected": -3.0357444286346436, "logps/chosen": -53.96470260620117, "logps/rejected": -54.99443435668945, "loss": 0.6902, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.006684791296720505, "rewards/margins": 0.006000310182571411, "rewards/rejected": -0.012685100547969341, "step": 910 }, { "epoch": 0.6340454858718125, "grad_norm": 1.1381434202194214, "learning_rate": 4.948207862024345e-08, "logits/chosen": -3.060519218444824, "logits/rejected": -3.0369343757629395, "logps/chosen": -53.765953063964844, "logps/rejected": -53.9163932800293, "loss": 0.6903, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.006491341628134251, "rewards/margins": 0.005870668683201075, "rewards/rejected": -0.012362010776996613, "step": 920 }, { "epoch": 0.6409372846312887, "grad_norm": 1.2366605997085571, "learning_rate": 4.9451186043024136e-08, "logits/chosen": -3.0556445121765137, "logits/rejected": -3.038353443145752, "logps/chosen": -54.4589729309082, "logps/rejected": -57.08545684814453, "loss": 0.6894, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.00594800990074873, "rewards/margins": 0.007657175417989492, "rewards/rejected": -0.013605184853076935, "step": 930 }, { "epoch": 0.647829083390765, "grad_norm": 1.225424885749817, "learning_rate": 4.941940884262618e-08, "logits/chosen": -3.0791478157043457, "logits/rejected": -3.0592617988586426, "logps/chosen": -55.0597038269043, "logps/rejected": -55.3946533203125, "loss": 0.6904, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.00773457158356905, "rewards/margins": 0.005577114410698414, "rewards/rejected": -0.013311685994267464, "step": 940 }, { "epoch": 0.6547208821502413, "grad_norm": 1.255478024482727, "learning_rate": 4.938674816872173e-08, "logits/chosen": -3.0655503273010254, "logits/rejected": -3.034986972808838, "logps/chosen": -55.53010177612305, "logps/rejected": -53.302825927734375, "loss": 0.69, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.007046771701425314, "rewards/margins": 0.006458138581365347, "rewards/rejected": -0.01350491028279066, "step": 950 }, { "epoch": 0.6616126809097175, "grad_norm": 1.2846956253051758, "learning_rate": 4.935320520294628e-08, "logits/chosen": -3.072808027267456, "logits/rejected": -3.0424203872680664, "logps/chosen": -56.58784866333008, "logps/rejected": -55.2563591003418, "loss": 0.6889, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.005053953267633915, "rewards/margins": 0.008650724776089191, "rewards/rejected": -0.013704678043723106, "step": 960 }, { "epoch": 0.6685044796691937, "grad_norm": 1.125267505645752, "learning_rate": 4.931878115885591e-08, "logits/chosen": -3.1181387901306152, "logits/rejected": -3.102503776550293, "logps/chosen": -56.124900817871094, "logps/rejected": -55.270362854003906, "loss": 0.6908, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.008215638808906078, "rewards/margins": 0.004827216267585754, "rewards/rejected": -0.013042854145169258, "step": 970 }, { "epoch": 0.6753962784286699, "grad_norm": 1.1747466325759888, "learning_rate": 4.9283477281883315e-08, "logits/chosen": -3.1210827827453613, "logits/rejected": -3.0842177867889404, "logps/chosen": -56.9710578918457, "logps/rejected": -53.4248161315918, "loss": 0.6887, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.006396190263330936, "rewards/margins": 0.009193857200443745, "rewards/rejected": -0.015590047463774681, "step": 980 }, { "epoch": 0.6822880771881461, "grad_norm": 1.293439269065857, "learning_rate": 4.9247294849292856e-08, "logits/chosen": -3.0506205558776855, "logits/rejected": -3.033032178878784, "logps/chosen": -53.939781188964844, "logps/rejected": -55.64631271362305, "loss": 0.6896, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.0070620314218103886, "rewards/margins": 0.007346457336097956, "rewards/rejected": -0.014408488757908344, "step": 990 }, { "epoch": 0.6891798759476223, "grad_norm": 1.2241309881210327, "learning_rate": 4.9210235170134244e-08, "logits/chosen": -3.080838441848755, "logits/rejected": -3.0573782920837402, "logps/chosen": -55.31145095825195, "logps/rejected": -55.0872917175293, "loss": 0.6899, "rewards/accuracies": 0.625, "rewards/chosen": -0.008531216531991959, "rewards/margins": 0.006765308324247599, "rewards/rejected": -0.015296523459255695, "step": 1000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -3.1467225551605225, "eval_logits/rejected": -3.141096830368042, "eval_logps/chosen": -58.55016326904297, "eval_logps/rejected": -63.2556037902832, "eval_loss": 0.692000687122345, "eval_rewards/accuracies": 0.5676115155220032, "eval_rewards/chosen": 0.0016172927571460605, "eval_rewards/margins": 0.0023720760364085436, "eval_rewards/rejected": -0.000754783337470144, "eval_runtime": 383.6651, "eval_samples_per_second": 11.218, "eval_steps_per_second": 1.402, "step": 1000 }, { "epoch": 0.6960716747070985, "grad_norm": 1.2173606157302856, "learning_rate": 4.917229958519526e-08, "logits/chosen": -3.0693936347961426, "logits/rejected": -3.0550990104675293, "logps/chosen": -53.986412048339844, "logps/rejected": -55.511878967285156, "loss": 0.69, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.008736327290534973, "rewards/margins": 0.006421979516744614, "rewards/rejected": -0.015158305875957012, "step": 1010 }, { "epoch": 0.7029634734665747, "grad_norm": 1.2777844667434692, "learning_rate": 4.9133489466953204e-08, "logits/chosen": -3.041337251663208, "logits/rejected": -3.0213141441345215, "logps/chosen": -54.19426345825195, "logps/rejected": -54.7755012512207, "loss": 0.6894, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010300601832568645, "rewards/margins": 0.0077950237318873405, "rewards/rejected": -0.018095625564455986, "step": 1020 }, { "epoch": 0.709855272226051, "grad_norm": 1.2164298295974731, "learning_rate": 4.909380621952524e-08, "logits/chosen": -3.0458850860595703, "logits/rejected": -3.0286898612976074, "logps/chosen": -54.57402420043945, "logps/rejected": -54.290138244628906, "loss": 0.6893, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.0098701948300004, "rewards/margins": 0.00787823274731636, "rewards/rejected": -0.017748426645994186, "step": 1030 }, { "epoch": 0.7167470709855273, "grad_norm": 1.1390899419784546, "learning_rate": 4.9053251278617604e-08, "logits/chosen": -3.0760250091552734, "logits/rejected": -3.0482988357543945, "logps/chosen": -55.69697952270508, "logps/rejected": -52.95030975341797, "loss": 0.6885, "rewards/accuracies": 0.65625, "rewards/chosen": -0.009436179883778095, "rewards/margins": 0.009637376293540001, "rewards/rejected": -0.01907355524599552, "step": 1040 }, { "epoch": 0.7236388697450035, "grad_norm": 1.23283851146698, "learning_rate": 4.9011826111473685e-08, "logits/chosen": -3.062562942504883, "logits/rejected": -3.0491833686828613, "logps/chosen": -54.286033630371094, "logps/rejected": -55.36510467529297, "loss": 0.6897, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.009069805033504963, "rewards/margins": 0.007122306618839502, "rewards/rejected": -0.016192112118005753, "step": 1050 }, { "epoch": 0.7305306685044797, "grad_norm": 1.2886954545974731, "learning_rate": 4.89695322168209e-08, "logits/chosen": -3.030677318572998, "logits/rejected": -3.008669137954712, "logps/chosen": -55.459442138671875, "logps/rejected": -55.95746612548828, "loss": 0.6883, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.010680411010980606, "rewards/margins": 0.010059915482997894, "rewards/rejected": -0.0207403264939785, "step": 1060 }, { "epoch": 0.7374224672639559, "grad_norm": 1.175155520439148, "learning_rate": 4.89263711248165e-08, "logits/chosen": -3.0101096630096436, "logits/rejected": -2.9888839721679688, "logps/chosen": -54.219993591308594, "logps/rejected": -54.440582275390625, "loss": 0.6887, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.009942619130015373, "rewards/margins": 0.009151456877589226, "rewards/rejected": -0.0190940760076046, "step": 1070 }, { "epoch": 0.7443142660234321, "grad_norm": 1.1949468851089478, "learning_rate": 4.8882344396992184e-08, "logits/chosen": -3.0487170219421387, "logits/rejected": -3.0279972553253174, "logps/chosen": -56.165000915527344, "logps/rejected": -54.65681076049805, "loss": 0.6889, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.010424690321087837, "rewards/margins": 0.008783365599811077, "rewards/rejected": -0.01920805498957634, "step": 1080 }, { "epoch": 0.7512060647829083, "grad_norm": 1.1826523542404175, "learning_rate": 4.883745362619765e-08, "logits/chosen": -3.013294219970703, "logits/rejected": -2.991460084915161, "logps/chosen": -53.631080627441406, "logps/rejected": -53.42609786987305, "loss": 0.6893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013389279134571552, "rewards/margins": 0.008029376156628132, "rewards/rejected": -0.021418655291199684, "step": 1090 }, { "epoch": 0.7580978635423845, "grad_norm": 1.185375452041626, "learning_rate": 4.8791700436542915e-08, "logits/chosen": -3.054615020751953, "logits/rejected": -3.0370595455169678, "logps/chosen": -56.0067253112793, "logps/rejected": -55.825843811035156, "loss": 0.6898, "rewards/accuracies": 0.5921875238418579, "rewards/chosen": -0.011557132005691528, "rewards/margins": 0.006960131227970123, "rewards/rejected": -0.018517261371016502, "step": 1100 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -3.1415205001831055, "eval_logits/rejected": -3.135866165161133, "eval_logps/chosen": -58.604042053222656, "eval_logps/rejected": -63.392513275146484, "eval_loss": 0.6916031837463379, "eval_rewards/accuracies": 0.580157995223999, "eval_rewards/chosen": 0.0010785621125251055, "eval_rewards/margins": 0.0032024432439357042, "eval_rewards/rejected": -0.0021238811314105988, "eval_runtime": 383.4073, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 1100 }, { "epoch": 0.7649896623018608, "grad_norm": 1.2535617351531982, "learning_rate": 4.874508648333959e-08, "logits/chosen": -3.05568265914917, "logits/rejected": -3.0244393348693848, "logps/chosen": -56.923561096191406, "logps/rejected": -55.1549072265625, "loss": 0.6884, "rewards/accuracies": 0.609375, "rewards/chosen": -0.012004630640149117, "rewards/margins": 0.009933307766914368, "rewards/rejected": -0.021937940269708633, "step": 1110 }, { "epoch": 0.771881461061337, "grad_norm": 1.1702125072479248, "learning_rate": 4.8697613453040974e-08, "logits/chosen": -3.064448833465576, "logits/rejected": -3.050039052963257, "logps/chosen": -55.2637825012207, "logps/rejected": -54.52691650390625, "loss": 0.6895, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.014187818393111229, "rewards/margins": 0.007751249708235264, "rewards/rejected": -0.021939069032669067, "step": 1120 }, { "epoch": 0.7787732598208132, "grad_norm": 1.2226091623306274, "learning_rate": 4.864928306318104e-08, "logits/chosen": -3.072038412094116, "logits/rejected": -3.04901123046875, "logps/chosen": -54.69261932373047, "logps/rejected": -54.283836364746094, "loss": 0.6891, "rewards/accuracies": 0.589062511920929, "rewards/chosen": -0.013159675523638725, "rewards/margins": 0.00843111053109169, "rewards/rejected": -0.021590787917375565, "step": 1130 }, { "epoch": 0.7856650585802895, "grad_norm": 1.1798851490020752, "learning_rate": 4.860009706231234e-08, "logits/chosen": -3.033435821533203, "logits/rejected": -3.0129566192626953, "logps/chosen": -55.970245361328125, "logps/rejected": -54.931739807128906, "loss": 0.6883, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0119981300085783, "rewards/margins": 0.01007566973567009, "rewards/rejected": -0.02207379788160324, "step": 1140 }, { "epoch": 0.7925568573397657, "grad_norm": 1.1462080478668213, "learning_rate": 4.8550057229942654e-08, "logits/chosen": -3.0121278762817383, "logits/rejected": -2.9870219230651855, "logps/chosen": -54.78216552734375, "logps/rejected": -54.829002380371094, "loss": 0.6887, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.013823253102600574, "rewards/margins": 0.009285924956202507, "rewards/rejected": -0.023109178990125656, "step": 1150 }, { "epoch": 0.7994486560992419, "grad_norm": 1.2008706331253052, "learning_rate": 4.849916537647071e-08, "logits/chosen": -3.041491985321045, "logits/rejected": -3.0206379890441895, "logps/chosen": -56.19243240356445, "logps/rejected": -57.039947509765625, "loss": 0.6887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.014686907641589642, "rewards/margins": 0.009198293089866638, "rewards/rejected": -0.023885199800133705, "step": 1160 }, { "epoch": 0.8063404548587181, "grad_norm": 1.1627824306488037, "learning_rate": 4.844742334312059e-08, "logits/chosen": -3.056154489517212, "logits/rejected": -3.028928279876709, "logps/chosen": -54.64860916137695, "logps/rejected": -53.8159294128418, "loss": 0.6881, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.01304612122476101, "rewards/margins": 0.010500517673790455, "rewards/rejected": -0.02354663982987404, "step": 1170 }, { "epoch": 0.8132322536181944, "grad_norm": 1.244218349456787, "learning_rate": 4.8394833001875206e-08, "logits/chosen": -3.057952880859375, "logits/rejected": -3.033057689666748, "logps/chosen": -56.92658615112305, "logps/rejected": -56.18558883666992, "loss": 0.6878, "rewards/accuracies": 0.589062511920929, "rewards/chosen": -0.013648271560668945, "rewards/margins": 0.01119085680693388, "rewards/rejected": -0.0248391292989254, "step": 1180 }, { "epoch": 0.8201240523776706, "grad_norm": 1.2377841472625732, "learning_rate": 4.834139625540851e-08, "logits/chosen": -3.0253548622131348, "logits/rejected": -3.0057997703552246, "logps/chosen": -55.36137008666992, "logps/rejected": -55.09904861450195, "loss": 0.6886, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.013483904302120209, "rewards/margins": 0.009393163025379181, "rewards/rejected": -0.022877071052789688, "step": 1190 }, { "epoch": 0.8270158511371468, "grad_norm": 1.2726726531982422, "learning_rate": 4.828711503701667e-08, "logits/chosen": -3.032179594039917, "logits/rejected": -3.011740207672119, "logps/chosen": -56.722267150878906, "logps/rejected": -55.44092559814453, "loss": 0.689, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.01510487962514162, "rewards/margins": 0.008657781407237053, "rewards/rejected": -0.023762661963701248, "step": 1200 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -3.137138843536377, "eval_logits/rejected": -3.131552219390869, "eval_logps/chosen": -58.70986557006836, "eval_logps/rejected": -63.55654525756836, "eval_loss": 0.6913304328918457, "eval_rewards/accuracies": 0.5752788186073303, "eval_rewards/chosen": 2.031068470387254e-05, "eval_rewards/margins": 0.0037846206687390804, "eval_rewards/rejected": -0.003764310386031866, "eval_runtime": 383.4775, "eval_samples_per_second": 11.224, "eval_steps_per_second": 1.403, "step": 1200 }, { "epoch": 0.833907649896623, "grad_norm": 1.3008631467819214, "learning_rate": 4.823199131054816e-08, "logits/chosen": -3.0267539024353027, "logits/rejected": -3.0021111965179443, "logps/chosen": -56.07625198364258, "logps/rejected": -54.91737747192383, "loss": 0.6881, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.015460239723324776, "rewards/margins": 0.010632393881678581, "rewards/rejected": -0.026092633605003357, "step": 1210 }, { "epoch": 0.8407994486560992, "grad_norm": 1.1622722148895264, "learning_rate": 4.8176027070332646e-08, "logits/chosen": -3.070613145828247, "logits/rejected": -3.042210578918457, "logps/chosen": -57.35967254638672, "logps/rejected": -54.246551513671875, "loss": 0.6878, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.015492687933146954, "rewards/margins": 0.011219398118555546, "rewards/rejected": -0.0267120860517025, "step": 1220 }, { "epoch": 0.8476912474155754, "grad_norm": 1.212531328201294, "learning_rate": 4.811922434110889e-08, "logits/chosen": -3.0541586875915527, "logits/rejected": -3.027801036834717, "logps/chosen": -56.26640701293945, "logps/rejected": -55.337791442871094, "loss": 0.6867, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.015658894553780556, "rewards/margins": 0.013525622896850109, "rewards/rejected": -0.02918451651930809, "step": 1230 }, { "epoch": 0.8545830461750517, "grad_norm": 1.2175966501235962, "learning_rate": 4.806158517795148e-08, "logits/chosen": -3.0408217906951904, "logits/rejected": -3.02706241607666, "logps/chosen": -55.906227111816406, "logps/rejected": -58.201499938964844, "loss": 0.6889, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.018697407096624374, "rewards/margins": 0.008898923173546791, "rewards/rejected": -0.027596330270171165, "step": 1240 }, { "epoch": 0.8614748449345279, "grad_norm": 1.3008414506912231, "learning_rate": 4.800311166619646e-08, "logits/chosen": -3.0177557468414307, "logits/rejected": -3.006732940673828, "logps/chosen": -54.774925231933594, "logps/rejected": -56.1735725402832, "loss": 0.6895, "rewards/accuracies": 0.567187488079071, "rewards/chosen": -0.01820097118616104, "rewards/margins": 0.0077009438537061214, "rewards/rejected": -0.0259019136428833, "step": 1250 }, { "epoch": 0.8683666436940042, "grad_norm": 1.269448161125183, "learning_rate": 4.794380592136591e-08, "logits/chosen": -3.0406134128570557, "logits/rejected": -3.0144009590148926, "logps/chosen": -55.88134002685547, "logps/rejected": -54.799659729003906, "loss": 0.6878, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.018001314252614975, "rewards/margins": 0.011205502785742283, "rewards/rejected": -0.029206816107034683, "step": 1260 }, { "epoch": 0.8752584424534804, "grad_norm": 1.2198677062988281, "learning_rate": 4.788367008909139e-08, "logits/chosen": -3.0545315742492676, "logits/rejected": -3.0307886600494385, "logps/chosen": -55.82643508911133, "logps/rejected": -54.3618278503418, "loss": 0.6875, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.017442066222429276, "rewards/margins": 0.011773045174777508, "rewards/rejected": -0.029215116053819656, "step": 1270 }, { "epoch": 0.8821502412129566, "grad_norm": 1.2116971015930176, "learning_rate": 4.782270634503631e-08, "logits/chosen": -3.04874324798584, "logits/rejected": -3.0147624015808105, "logps/chosen": -57.036094665527344, "logps/rejected": -55.135292053222656, "loss": 0.6865, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.017601333558559418, "rewards/margins": 0.013879353180527687, "rewards/rejected": -0.031480688601732254, "step": 1280 }, { "epoch": 0.8890420399724328, "grad_norm": 1.1639325618743896, "learning_rate": 4.776091689481725e-08, "logits/chosen": -3.0866382122039795, "logits/rejected": -3.0574278831481934, "logps/chosen": -55.75977325439453, "logps/rejected": -54.65210723876953, "loss": 0.6863, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.017770206555724144, "rewards/margins": 0.014166529290378094, "rewards/rejected": -0.03193673864006996, "step": 1290 }, { "epoch": 0.895933838731909, "grad_norm": 1.2590343952178955, "learning_rate": 4.7698303973924136e-08, "logits/chosen": -3.0766139030456543, "logits/rejected": -3.0598435401916504, "logps/chosen": -55.7991943359375, "logps/rejected": -56.533660888671875, "loss": 0.6881, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.018945975229144096, "rewards/margins": 0.01061730831861496, "rewards/rejected": -0.029563283547759056, "step": 1300 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -3.1325011253356934, "eval_logits/rejected": -3.1268014907836914, "eval_logps/chosen": -58.86237716674805, "eval_logps/rejected": -63.79023742675781, "eval_loss": 0.6909503936767578, "eval_rewards/accuracies": 0.5803903341293335, "eval_rewards/chosen": -0.0015047783963382244, "eval_rewards/margins": 0.0045963493175804615, "eval_rewards/rejected": -0.006101127713918686, "eval_runtime": 384.151, "eval_samples_per_second": 11.204, "eval_steps_per_second": 1.4, "step": 1300 }, { "epoch": 0.9028256374913852, "grad_norm": 1.2107516527175903, "learning_rate": 4.7634869847639334e-08, "logits/chosen": -3.0417442321777344, "logits/rejected": -3.026449680328369, "logps/chosen": -54.883506774902344, "logps/rejected": -56.9136962890625, "loss": 0.6872, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.018119771033525467, "rewards/margins": 0.012413778342306614, "rewards/rejected": -0.030533546581864357, "step": 1310 }, { "epoch": 0.9097174362508614, "grad_norm": 1.2329145669937134, "learning_rate": 4.757061681095577e-08, "logits/chosen": -3.0578348636627197, "logits/rejected": -3.0322513580322266, "logps/chosen": -55.66331100463867, "logps/rejected": -54.4700813293457, "loss": 0.6876, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.022416267544031143, "rewards/margins": 0.011768505908548832, "rewards/rejected": -0.0341847725212574, "step": 1320 }, { "epoch": 0.9166092350103378, "grad_norm": 1.2309297323226929, "learning_rate": 4.750554718849381e-08, "logits/chosen": -3.0512232780456543, "logits/rejected": -3.0197510719299316, "logps/chosen": -58.36737060546875, "logps/rejected": -55.63842010498047, "loss": 0.6861, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.017069976776838303, "rewards/margins": 0.014708531089127064, "rewards/rejected": -0.03177850693464279, "step": 1330 }, { "epoch": 0.923501033769814, "grad_norm": 1.217519998550415, "learning_rate": 4.743966333441723e-08, "logits/chosen": -3.0251669883728027, "logits/rejected": -3.0018458366394043, "logps/chosen": -55.40571212768555, "logps/rejected": -56.7693977355957, "loss": 0.6869, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02137664332985878, "rewards/margins": 0.013183819130063057, "rewards/rejected": -0.034560464322566986, "step": 1340 }, { "epoch": 0.9303928325292902, "grad_norm": 1.2220336198806763, "learning_rate": 4.7372967632348016e-08, "logits/chosen": -3.0455212593078613, "logits/rejected": -3.016047954559326, "logps/chosen": -56.66341018676758, "logps/rejected": -56.11173629760742, "loss": 0.6857, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.019142866134643555, "rewards/margins": 0.01562575250864029, "rewards/rejected": -0.034768618643283844, "step": 1350 }, { "epoch": 0.9372846312887664, "grad_norm": 1.240531325340271, "learning_rate": 4.7305462495280103e-08, "logits/chosen": -3.0660104751586914, "logits/rejected": -3.0503532886505127, "logps/chosen": -57.38665008544922, "logps/rejected": -57.25360870361328, "loss": 0.6875, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.023473698645830154, "rewards/margins": 0.011975238099694252, "rewards/rejected": -0.03544893115758896, "step": 1360 }, { "epoch": 0.9441764300482426, "grad_norm": 1.2872986793518066, "learning_rate": 4.723715036549211e-08, "logits/chosen": -3.0243756771087646, "logits/rejected": -3.0061607360839844, "logps/chosen": -55.8559684753418, "logps/rejected": -55.590736389160156, "loss": 0.6872, "rewards/accuracies": 0.5953124761581421, "rewards/chosen": -0.024248816072940826, "rewards/margins": 0.01261775754392147, "rewards/rejected": -0.03686657175421715, "step": 1370 }, { "epoch": 0.9510682288077188, "grad_norm": 1.2915130853652954, "learning_rate": 4.7168033714458986e-08, "logits/chosen": -3.033684730529785, "logits/rejected": -3.0120129585266113, "logps/chosen": -56.7520751953125, "logps/rejected": -56.18404006958008, "loss": 0.6872, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.02145109511911869, "rewards/margins": 0.012504595331847668, "rewards/rejected": -0.033955689519643784, "step": 1380 }, { "epoch": 0.957960027567195, "grad_norm": 1.2548354864120483, "learning_rate": 4.7098115042762554e-08, "logits/chosen": -3.0042724609375, "logits/rejected": -2.9861717224121094, "logps/chosen": -56.3974723815918, "logps/rejected": -56.91527557373047, "loss": 0.6866, "rewards/accuracies": 0.625, "rewards/chosen": -0.0220785029232502, "rewards/margins": 0.013720555230975151, "rewards/rejected": -0.0357990600168705, "step": 1390 }, { "epoch": 0.9648518263266712, "grad_norm": 1.1763672828674316, "learning_rate": 4.702739688000106e-08, "logits/chosen": -3.055232524871826, "logits/rejected": -3.037727117538452, "logps/chosen": -55.6951904296875, "logps/rejected": -59.415077209472656, "loss": 0.6874, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.024004105478525162, "rewards/margins": 0.01218905858695507, "rewards/rejected": -0.036193158477544785, "step": 1400 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -3.126948595046997, "eval_logits/rejected": -3.1213085651397705, "eval_logps/chosen": -59.07985305786133, "eval_logps/rejected": -64.06278228759766, "eval_loss": 0.6907014846801758, "eval_rewards/accuracies": 0.5824813842773438, "eval_rewards/chosen": -0.003679573303088546, "eval_rewards/margins": 0.005147051997482777, "eval_rewards/rejected": -0.008826625533401966, "eval_runtime": 383.5219, "eval_samples_per_second": 11.222, "eval_steps_per_second": 1.403, "step": 1400 }, { "epoch": 0.9717436250861475, "grad_norm": 1.3487069606781006, "learning_rate": 4.695588178469768e-08, "logits/chosen": -3.0565574169158936, "logits/rejected": -3.0355658531188965, "logps/chosen": -56.48865509033203, "logps/rejected": -57.217254638671875, "loss": 0.6862, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.021701959893107414, "rewards/margins": 0.014641150832176208, "rewards/rejected": -0.036343105137348175, "step": 1410 }, { "epoch": 0.9786354238456237, "grad_norm": 1.2262191772460938, "learning_rate": 4.688357234420793e-08, "logits/chosen": -2.9956376552581787, "logits/rejected": -2.9775960445404053, "logps/chosen": -55.699180603027344, "logps/rejected": -55.796051025390625, "loss": 0.6873, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0262535959482193, "rewards/margins": 0.012488202191889286, "rewards/rejected": -0.03874180465936661, "step": 1420 }, { "epoch": 0.9855272226051, "grad_norm": 1.3601967096328735, "learning_rate": 4.681047117462605e-08, "logits/chosen": -3.0274033546447754, "logits/rejected": -3.0017144680023193, "logps/chosen": -57.95294189453125, "logps/rejected": -55.97968673706055, "loss": 0.684, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.022255446761846542, "rewards/margins": 0.019062072038650513, "rewards/rejected": -0.041317522525787354, "step": 1430 }, { "epoch": 0.9924190213645762, "grad_norm": 1.2903742790222168, "learning_rate": 4.673658092069036e-08, "logits/chosen": -3.0665290355682373, "logits/rejected": -3.0414271354675293, "logps/chosen": -58.47101974487305, "logps/rejected": -57.185943603515625, "loss": 0.6852, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.024692287668585777, "rewards/margins": 0.016749979928135872, "rewards/rejected": -0.04144226759672165, "step": 1440 }, { "epoch": 0.9993108201240524, "grad_norm": 1.2878719568252563, "learning_rate": 4.666190425568761e-08, "logits/chosen": -2.9909021854400635, "logits/rejected": -2.9653737545013428, "logps/chosen": -54.072975158691406, "logps/rejected": -57.27626419067383, "loss": 0.687, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02437172830104828, "rewards/margins": 0.013051750138401985, "rewards/rejected": -0.03742348030209541, "step": 1450 }, { "epoch": 1.0062026188835287, "grad_norm": 1.3313488960266113, "learning_rate": 4.658644388135622e-08, "logits/chosen": -3.045004367828369, "logits/rejected": -3.019038677215576, "logps/chosen": -55.79365158081055, "logps/rejected": -56.9145393371582, "loss": 0.6847, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.025482362136244774, "rewards/margins": 0.017755242064595222, "rewards/rejected": -0.043237604200839996, "step": 1460 }, { "epoch": 1.0130944176430048, "grad_norm": 1.2588318586349487, "learning_rate": 4.651020252778855e-08, "logits/chosen": -3.0149521827697754, "logits/rejected": -2.9986729621887207, "logps/chosen": -55.90470504760742, "logps/rejected": -58.25142288208008, "loss": 0.6847, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.025019744411110878, "rewards/margins": 0.017768146470189095, "rewards/rejected": -0.04278789088129997, "step": 1470 }, { "epoch": 1.019986216402481, "grad_norm": 1.260459065437317, "learning_rate": 4.6433182953332116e-08, "logits/chosen": -3.020815372467041, "logits/rejected": -3.0085294246673584, "logps/chosen": -56.265525817871094, "logps/rejected": -60.046836853027344, "loss": 0.6863, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.02558632753789425, "rewards/margins": 0.01462565641850233, "rewards/rejected": -0.040211986750364304, "step": 1480 }, { "epoch": 1.0268780151619572, "grad_norm": 1.3650516271591187, "learning_rate": 4.635538794448982e-08, "logits/chosen": -2.988888740539551, "logits/rejected": -2.9630465507507324, "logps/chosen": -56.615989685058594, "logps/rejected": -56.81156539916992, "loss": 0.6845, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.024925198405981064, "rewards/margins": 0.01817614585161209, "rewards/rejected": -0.04310134798288345, "step": 1490 }, { "epoch": 1.0337698139214335, "grad_norm": 1.3529406785964966, "learning_rate": 4.627682031581913e-08, "logits/chosen": -2.9961295127868652, "logits/rejected": -2.978667974472046, "logps/chosen": -55.975120544433594, "logps/rejected": -57.098876953125, "loss": 0.6867, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.030640650540590286, "rewards/margins": 0.013602579943835735, "rewards/rejected": -0.0442432276904583, "step": 1500 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -3.119849443435669, "eval_logits/rejected": -3.114168167114258, "eval_logps/chosen": -59.33809280395508, "eval_logps/rejected": -64.41690063476562, "eval_loss": 0.6902598738670349, "eval_rewards/accuracies": 0.5843401551246643, "eval_rewards/chosen": -0.006262009963393211, "eval_rewards/margins": 0.0061058239080011845, "eval_rewards/rejected": -0.012367835268378258, "eval_runtime": 383.624, "eval_samples_per_second": 11.219, "eval_steps_per_second": 1.402, "step": 1500 }, { "epoch": 1.0406616126809096, "grad_norm": 1.1866661310195923, "learning_rate": 4.619748290983022e-08, "logits/chosen": -3.021925449371338, "logits/rejected": -3.001587390899658, "logps/chosen": -55.82487106323242, "logps/rejected": -55.3885383605957, "loss": 0.6862, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.030826866626739502, "rewards/margins": 0.014813138172030449, "rewards/rejected": -0.0456400066614151, "step": 1510 }, { "epoch": 1.047553411440386, "grad_norm": 1.2700278759002686, "learning_rate": 4.611737859688317e-08, "logits/chosen": -2.988671064376831, "logits/rejected": -2.965223550796509, "logps/chosen": -59.33045196533203, "logps/rejected": -57.386077880859375, "loss": 0.6847, "rewards/accuracies": 0.65625, "rewards/chosen": -0.028667423874139786, "rewards/margins": 0.017784085124731064, "rewards/rejected": -0.04645150899887085, "step": 1520 }, { "epoch": 1.0544452101998623, "grad_norm": 1.3180135488510132, "learning_rate": 4.6036510275084114e-08, "logits/chosen": -3.060955047607422, "logits/rejected": -3.0398271083831787, "logps/chosen": -56.164329528808594, "logps/rejected": -57.612144470214844, "loss": 0.6839, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.026113372296094894, "rewards/margins": 0.019537653774023056, "rewards/rejected": -0.04565102607011795, "step": 1530 }, { "epoch": 1.0613370089593384, "grad_norm": 1.2003244161605835, "learning_rate": 4.5954880870180344e-08, "logits/chosen": -2.999513626098633, "logits/rejected": -2.9835972785949707, "logps/chosen": -55.88873291015625, "logps/rejected": -56.8199577331543, "loss": 0.6858, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.03025154210627079, "rewards/margins": 0.01554082054644823, "rewards/rejected": -0.045792363584041595, "step": 1540 }, { "epoch": 1.0682288077188147, "grad_norm": 1.2452858686447144, "learning_rate": 4.587249333545453e-08, "logits/chosen": -3.0182111263275146, "logits/rejected": -2.9967448711395264, "logps/chosen": -60.4226188659668, "logps/rejected": -59.12889862060547, "loss": 0.6844, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02846052683889866, "rewards/margins": 0.018600499257445335, "rewards/rejected": -0.04706102982163429, "step": 1550 }, { "epoch": 1.0751206064782908, "grad_norm": 1.2952637672424316, "learning_rate": 4.578935065161782e-08, "logits/chosen": -3.0487442016601562, "logits/rejected": -3.0232508182525635, "logps/chosen": -58.391517639160156, "logps/rejected": -58.71136474609375, "loss": 0.684, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.028455624356865883, "rewards/margins": 0.01941332034766674, "rewards/rejected": -0.04786894470453262, "step": 1560 }, { "epoch": 1.082012405237767, "grad_norm": 1.2870752811431885, "learning_rate": 4.570545582670201e-08, "logits/chosen": -3.023087739944458, "logits/rejected": -3.0056240558624268, "logps/chosen": -57.30908203125, "logps/rejected": -57.751304626464844, "loss": 0.6847, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.03067805804312229, "rewards/margins": 0.017942866310477257, "rewards/rejected": -0.04862092435359955, "step": 1570 }, { "epoch": 1.0889042039972432, "grad_norm": 1.2773584127426147, "learning_rate": 4.5620811895950746e-08, "logits/chosen": -3.0028977394104004, "logits/rejected": -2.9747323989868164, "logps/chosen": -55.47496795654297, "logps/rejected": -57.35260772705078, "loss": 0.6846, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.032265856862068176, "rewards/margins": 0.01801823452115059, "rewards/rejected": -0.050284095108509064, "step": 1580 }, { "epoch": 1.0957960027567195, "grad_norm": 1.2767822742462158, "learning_rate": 4.553542192170966e-08, "logits/chosen": -2.9901621341705322, "logits/rejected": -2.977173328399658, "logps/chosen": -57.745277404785156, "logps/rejected": -59.282257080078125, "loss": 0.6853, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.032800786197185516, "rewards/margins": 0.016637884080410004, "rewards/rejected": -0.04943867027759552, "step": 1590 }, { "epoch": 1.1026878015161956, "grad_norm": 1.323087453842163, "learning_rate": 4.5449288993315615e-08, "logits/chosen": -3.056546211242676, "logits/rejected": -3.0385046005249023, "logps/chosen": -58.08835983276367, "logps/rejected": -59.3845100402832, "loss": 0.6857, "rewards/accuracies": 0.5953124761581421, "rewards/chosen": -0.031743548810482025, "rewards/margins": 0.015841025859117508, "rewards/rejected": -0.047584570944309235, "step": 1600 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -3.113739013671875, "eval_logits/rejected": -3.108053207397461, "eval_logps/chosen": -59.68599319458008, "eval_logps/rejected": -64.84294891357422, "eval_loss": 0.6899123191833496, "eval_rewards/accuracies": 0.5875929594039917, "eval_rewards/chosen": -0.009740971960127354, "eval_rewards/margins": 0.006887332070618868, "eval_rewards/rejected": -0.01662830449640751, "eval_runtime": 383.3773, "eval_samples_per_second": 11.227, "eval_steps_per_second": 1.403, "step": 1600 }, { "epoch": 1.109579600275672, "grad_norm": 1.2986916303634644, "learning_rate": 4.536241622698493e-08, "logits/chosen": -3.0435335636138916, "logits/rejected": -3.0242807865142822, "logps/chosen": -56.545982360839844, "logps/rejected": -56.66587448120117, "loss": 0.686, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.03344612196087837, "rewards/margins": 0.01548151858150959, "rewards/rejected": -0.04892764240503311, "step": 1610 }, { "epoch": 1.1164713990351482, "grad_norm": 1.2819620370864868, "learning_rate": 4.5274806765700636e-08, "logits/chosen": -2.979796886444092, "logits/rejected": -2.9572014808654785, "logps/chosen": -57.44049835205078, "logps/rejected": -58.2237434387207, "loss": 0.6833, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.030573686584830284, "rewards/margins": 0.020742163062095642, "rewards/rejected": -0.051315851509571075, "step": 1620 }, { "epoch": 1.1233631977946243, "grad_norm": 1.2927671670913696, "learning_rate": 4.518646377909875e-08, "logits/chosen": -3.0702857971191406, "logits/rejected": -3.047144651412964, "logps/chosen": -58.362464904785156, "logps/rejected": -57.96095657348633, "loss": 0.6846, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.03306771069765091, "rewards/margins": 0.018272345885634422, "rewards/rejected": -0.05134005472064018, "step": 1630 }, { "epoch": 1.1302549965541007, "grad_norm": 1.3534393310546875, "learning_rate": 4.5097390463353626e-08, "logits/chosen": -2.9942617416381836, "logits/rejected": -2.963628053665161, "logps/chosen": -57.976112365722656, "logps/rejected": -58.23329544067383, "loss": 0.6833, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.03338998928666115, "rewards/margins": 0.020668480545282364, "rewards/rejected": -0.05405846983194351, "step": 1640 }, { "epoch": 1.1371467953135768, "grad_norm": 1.2082114219665527, "learning_rate": 4.5007590041062295e-08, "logits/chosen": -3.046212911605835, "logits/rejected": -3.019162178039551, "logps/chosen": -57.87427520751953, "logps/rejected": -57.6834831237793, "loss": 0.6855, "rewards/accuracies": 0.589062511920929, "rewards/chosen": -0.03608114644885063, "rewards/margins": 0.016333777457475662, "rewards/rejected": -0.0524149164557457, "step": 1650 }, { "epoch": 1.144038594073053, "grad_norm": 1.1970142126083374, "learning_rate": 4.4917065761127907e-08, "logits/chosen": -3.0264124870300293, "logits/rejected": -2.999277114868164, "logps/chosen": -56.5915412902832, "logps/rejected": -56.590354919433594, "loss": 0.6836, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.038979362696409225, "rewards/margins": 0.020165814086794853, "rewards/rejected": -0.05914517492055893, "step": 1660 }, { "epoch": 1.1509303928325294, "grad_norm": 1.3876991271972656, "learning_rate": 4.482582089864214e-08, "logits/chosen": -3.0508224964141846, "logits/rejected": -3.033642053604126, "logps/chosen": -55.689598083496094, "logps/rejected": -59.55400466918945, "loss": 0.6835, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.03788014501333237, "rewards/margins": 0.02047286555171013, "rewards/rejected": -0.0583530068397522, "step": 1670 }, { "epoch": 1.1578221915920055, "grad_norm": 1.2665168046951294, "learning_rate": 4.473385875476675e-08, "logits/chosen": -3.009286403656006, "logits/rejected": -2.9915988445281982, "logps/chosen": -58.77642059326172, "logps/rejected": -59.92441940307617, "loss": 0.6821, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03369423374533653, "rewards/margins": 0.023475242778658867, "rewards/rejected": -0.05716947838664055, "step": 1680 }, { "epoch": 1.1647139903514818, "grad_norm": 1.239821195602417, "learning_rate": 4.464118265661414e-08, "logits/chosen": -3.0169005393981934, "logits/rejected": -3.0001168251037598, "logps/chosen": -57.96710968017578, "logps/rejected": -59.11944580078125, "loss": 0.6857, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.04055698588490486, "rewards/margins": 0.016156669706106186, "rewards/rejected": -0.05671365186572075, "step": 1690 }, { "epoch": 1.171605789110958, "grad_norm": 1.2715541124343872, "learning_rate": 4.454779595712694e-08, "logits/chosen": -3.0080418586730957, "logits/rejected": -2.9879698753356934, "logps/chosen": -57.514808654785156, "logps/rejected": -59.12221145629883, "loss": 0.6843, "rewards/accuracies": 0.625, "rewards/chosen": -0.04195026680827141, "rewards/margins": 0.018841687589883804, "rewards/rejected": -0.060791950672864914, "step": 1700 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -3.1070287227630615, "eval_logits/rejected": -3.101304054260254, "eval_logps/chosen": -60.19528579711914, "eval_logps/rejected": -65.44676208496094, "eval_loss": 0.689507246017456, "eval_rewards/accuracies": 0.5803903341293335, "eval_rewards/chosen": -0.014833912253379822, "eval_rewards/margins": 0.007832462899386883, "eval_rewards/rejected": -0.02266637235879898, "eval_runtime": 383.4144, "eval_samples_per_second": 11.225, "eval_steps_per_second": 1.403, "step": 1700 }, { "epoch": 1.1784975878704342, "grad_norm": 1.3879843950271606, "learning_rate": 4.4453702034956785e-08, "logits/chosen": -3.0700769424438477, "logits/rejected": -3.049919605255127, "logps/chosen": -58.9970588684082, "logps/rejected": -59.037841796875, "loss": 0.6831, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.03606757894158363, "rewards/margins": 0.0213716309517622, "rewards/rejected": -0.05743921920657158, "step": 1710 }, { "epoch": 1.1853893866299103, "grad_norm": 1.34134042263031, "learning_rate": 4.435890429434197e-08, "logits/chosen": -2.989974021911621, "logits/rejected": -2.9621846675872803, "logps/chosen": -58.78081512451172, "logps/rejected": -58.66382598876953, "loss": 0.6823, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.04123959690332413, "rewards/margins": 0.02316473424434662, "rewards/rejected": -0.06440433114767075, "step": 1720 }, { "epoch": 1.1922811853893867, "grad_norm": 1.267427682876587, "learning_rate": 4.426340616498437e-08, "logits/chosen": -3.0574259757995605, "logits/rejected": -3.031238079071045, "logps/chosen": -58.63545608520508, "logps/rejected": -59.8019905090332, "loss": 0.6829, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.03921722248196602, "rewards/margins": 0.021796178072690964, "rewards/rejected": -0.06101340055465698, "step": 1730 }, { "epoch": 1.1991729841488628, "grad_norm": 1.3406956195831299, "learning_rate": 4.416721110192535e-08, "logits/chosen": -3.013367176055908, "logits/rejected": -2.9800286293029785, "logps/chosen": -60.035980224609375, "logps/rejected": -57.699485778808594, "loss": 0.6815, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.03842981159687042, "rewards/margins": 0.02466021105647087, "rewards/rejected": -0.063090018928051, "step": 1740 }, { "epoch": 1.206064782908339, "grad_norm": 1.298407793045044, "learning_rate": 4.407032258542071e-08, "logits/chosen": -3.0415241718292236, "logits/rejected": -3.0157630443573, "logps/chosen": -59.692420959472656, "logps/rejected": -59.681793212890625, "loss": 0.6818, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.04181189090013504, "rewards/margins": 0.024130940437316895, "rewards/rejected": -0.06594283878803253, "step": 1750 }, { "epoch": 1.2129565816678154, "grad_norm": 1.3731602430343628, "learning_rate": 4.3972744120814834e-08, "logits/chosen": -3.0049002170562744, "logits/rejected": -2.9822535514831543, "logps/chosen": -58.5940055847168, "logps/rejected": -60.961036682128906, "loss": 0.6819, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.04359756037592888, "rewards/margins": 0.02393193170428276, "rewards/rejected": -0.06752948462963104, "step": 1760 }, { "epoch": 1.2198483804272915, "grad_norm": 1.3357396125793457, "learning_rate": 4.387447923841383e-08, "logits/chosen": -3.03355073928833, "logits/rejected": -3.009901523590088, "logps/chosen": -57.91057205200195, "logps/rejected": -60.24782180786133, "loss": 0.6831, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.047262683510780334, "rewards/margins": 0.02145593799650669, "rewards/rejected": -0.06871862709522247, "step": 1770 }, { "epoch": 1.2267401791867678, "grad_norm": 1.3635982275009155, "learning_rate": 4.377553149335783e-08, "logits/chosen": -3.0393521785736084, "logits/rejected": -3.0107617378234863, "logps/chosen": -61.046234130859375, "logps/rejected": -59.985321044921875, "loss": 0.6823, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.04533679410815239, "rewards/margins": 0.02301432564854622, "rewards/rejected": -0.0683511346578598, "step": 1780 }, { "epoch": 1.233631977946244, "grad_norm": 1.2849096059799194, "learning_rate": 4.367590446549234e-08, "logits/chosen": -3.0014703273773193, "logits/rejected": -2.977975368499756, "logps/chosen": -59.160911560058594, "logps/rejected": -60.51984405517578, "loss": 0.6829, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.047338586300611496, "rewards/margins": 0.022133398801088333, "rewards/rejected": -0.06947199255228043, "step": 1790 }, { "epoch": 1.2405237767057202, "grad_norm": 1.38184654712677, "learning_rate": 4.357560175923876e-08, "logits/chosen": -2.9890942573547363, "logits/rejected": -2.968458652496338, "logps/chosen": -59.32697677612305, "logps/rejected": -61.6325569152832, "loss": 0.6842, "rewards/accuracies": 0.609375, "rewards/chosen": -0.05167876556515694, "rewards/margins": 0.019454535096883774, "rewards/rejected": -0.07113330066204071, "step": 1800 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -3.100109577178955, "eval_logits/rejected": -3.094414234161377, "eval_logps/chosen": -60.904693603515625, "eval_logps/rejected": -66.26681518554688, "eval_loss": 0.689037024974823, "eval_rewards/accuracies": 0.5871282815933228, "eval_rewards/chosen": -0.02192796766757965, "eval_rewards/margins": 0.00893897283822298, "eval_rewards/rejected": -0.030866941437125206, "eval_runtime": 383.1969, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 1800 }, { "epoch": 1.2474155754651963, "grad_norm": 1.3076618909835815, "learning_rate": 4.347462700346395e-08, "logits/chosen": -3.002626419067383, "logits/rejected": -2.9803082942962646, "logps/chosen": -60.84148025512695, "logps/rejected": -60.021446228027344, "loss": 0.6845, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.05034772306680679, "rewards/margins": 0.01875271275639534, "rewards/rejected": -0.06910042464733124, "step": 1810 }, { "epoch": 1.2543073742246726, "grad_norm": 1.3908463716506958, "learning_rate": 4.337298385134896e-08, "logits/chosen": -3.0724968910217285, "logits/rejected": -3.039731979370117, "logps/chosen": -59.659080505371094, "logps/rejected": -60.214622497558594, "loss": 0.6806, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.04874369129538536, "rewards/margins": 0.02647136151790619, "rewards/rejected": -0.07521505653858185, "step": 1820 }, { "epoch": 1.2611991729841487, "grad_norm": 1.3290398120880127, "learning_rate": 4.327067598025686e-08, "logits/chosen": -2.9579367637634277, "logits/rejected": -2.9380383491516113, "logps/chosen": -58.9208984375, "logps/rejected": -59.662330627441406, "loss": 0.6858, "rewards/accuracies": 0.5921875238418579, "rewards/chosen": -0.05304742977023125, "rewards/margins": 0.01629539206624031, "rewards/rejected": -0.06934282183647156, "step": 1830 }, { "epoch": 1.268090971743625, "grad_norm": 1.3474442958831787, "learning_rate": 4.316770709159966e-08, "logits/chosen": -3.0216171741485596, "logits/rejected": -2.995469570159912, "logps/chosen": -59.756561279296875, "logps/rejected": -59.7965087890625, "loss": 0.681, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.04841332510113716, "rewards/margins": 0.025948714464902878, "rewards/rejected": -0.07436203211545944, "step": 1840 }, { "epoch": 1.2749827705031014, "grad_norm": 1.2983741760253906, "learning_rate": 4.306408091070445e-08, "logits/chosen": -2.9480409622192383, "logits/rejected": -2.9225194454193115, "logps/chosen": -58.772979736328125, "logps/rejected": -59.58469772338867, "loss": 0.6835, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.05363209918141365, "rewards/margins": 0.02092006430029869, "rewards/rejected": -0.07455216348171234, "step": 1850 }, { "epoch": 1.2818745692625775, "grad_norm": 1.3574970960617065, "learning_rate": 4.29598011866786e-08, "logits/chosen": -2.9424538612365723, "logits/rejected": -2.9136080741882324, "logps/chosen": -60.003875732421875, "logps/rejected": -60.62981033325195, "loss": 0.6808, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.051705311983823776, "rewards/margins": 0.026265686377882957, "rewards/rejected": -0.07797099649906158, "step": 1860 }, { "epoch": 1.2887663680220538, "grad_norm": 1.4326393604278564, "learning_rate": 4.285487169227408e-08, "logits/chosen": -2.99379301071167, "logits/rejected": -2.9764721393585205, "logps/chosen": -59.060340881347656, "logps/rejected": -58.82389450073242, "loss": 0.6836, "rewards/accuracies": 0.59375, "rewards/chosen": -0.05216605216264725, "rewards/margins": 0.020805656909942627, "rewards/rejected": -0.07297170907258987, "step": 1870 }, { "epoch": 1.29565816678153, "grad_norm": 1.3941576480865479, "learning_rate": 4.2749296223751055e-08, "logits/chosen": -2.9548556804656982, "logits/rejected": -2.9317376613616943, "logps/chosen": -58.838478088378906, "logps/rejected": -60.188232421875, "loss": 0.6822, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.052825819700956345, "rewards/margins": 0.023545509204268456, "rewards/rejected": -0.07637132704257965, "step": 1880 }, { "epoch": 1.3025499655410062, "grad_norm": 1.3499064445495605, "learning_rate": 4.264307860074045e-08, "logits/chosen": -3.0092263221740723, "logits/rejected": -2.9972012042999268, "logps/chosen": -57.108421325683594, "logps/rejected": -60.75562286376953, "loss": 0.6821, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.050775300711393356, "rewards/margins": 0.02356244996190071, "rewards/rejected": -0.07433775067329407, "step": 1890 }, { "epoch": 1.3094417643004825, "grad_norm": 1.2492562532424927, "learning_rate": 4.253622266610579e-08, "logits/chosen": -3.0359554290771484, "logits/rejected": -3.0080196857452393, "logps/chosen": -61.222373962402344, "logps/rejected": -59.80623245239258, "loss": 0.6802, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.05304674431681633, "rewards/margins": 0.02771439589560032, "rewards/rejected": -0.0807611495256424, "step": 1900 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -3.0939745903015137, "eval_logits/rejected": -3.0882840156555176, "eval_logps/chosen": -61.343780517578125, "eval_logps/rejected": -66.79544067382812, "eval_loss": 0.6886445879936218, "eval_rewards/accuracies": 0.5920074582099915, "eval_rewards/chosen": -0.02631884254515171, "eval_rewards/margins": 0.00983431562781334, "eval_rewards/rejected": -0.0361531563103199, "eval_runtime": 383.3795, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 1900 }, { "epoch": 1.3163335630599586, "grad_norm": 1.4036768674850464, "learning_rate": 4.24287322858042e-08, "logits/chosen": -2.958085536956787, "logits/rejected": -2.940584659576416, "logps/chosen": -59.94134521484375, "logps/rejected": -60.20750045776367, "loss": 0.6829, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.053975123912096024, "rewards/margins": 0.021946988999843597, "rewards/rejected": -0.07592211663722992, "step": 1910 }, { "epoch": 1.323225361819435, "grad_norm": 1.3422105312347412, "learning_rate": 4.2320611348746484e-08, "logits/chosen": -3.0179717540740967, "logits/rejected": -2.990093469619751, "logps/chosen": -61.46753692626953, "logps/rejected": -61.537109375, "loss": 0.6807, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.0556570403277874, "rewards/margins": 0.026717105880379677, "rewards/rejected": -0.08237414062023163, "step": 1920 }, { "epoch": 1.330117160578911, "grad_norm": 1.3527724742889404, "learning_rate": 4.221186376665648e-08, "logits/chosen": -3.0378904342651367, "logits/rejected": -3.0053915977478027, "logps/chosen": -59.75395965576172, "logps/rejected": -59.148101806640625, "loss": 0.6822, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.056514717638492584, "rewards/margins": 0.023727912455797195, "rewards/rejected": -0.08024262636899948, "step": 1930 }, { "epoch": 1.3370089593383874, "grad_norm": 1.4073480367660522, "learning_rate": 4.210249347392949e-08, "logits/chosen": -2.941415548324585, "logits/rejected": -2.9191932678222656, "logps/chosen": -58.29315185546875, "logps/rejected": -61.125091552734375, "loss": 0.6797, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.05773955583572388, "rewards/margins": 0.0286725964397192, "rewards/rejected": -0.08641214668750763, "step": 1940 }, { "epoch": 1.3439007580978635, "grad_norm": 1.3412295579910278, "learning_rate": 4.199250442748998e-08, "logits/chosen": -3.0176608562469482, "logits/rejected": -3.0019617080688477, "logps/chosen": -58.65700149536133, "logps/rejected": -61.8019905090332, "loss": 0.683, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.06355486065149307, "rewards/margins": 0.02189701236784458, "rewards/rejected": -0.08545185625553131, "step": 1950 }, { "epoch": 1.3507925568573398, "grad_norm": 1.3129335641860962, "learning_rate": 4.188190060664839e-08, "logits/chosen": -2.988128900527954, "logits/rejected": -2.9653995037078857, "logps/chosen": -60.391929626464844, "logps/rejected": -62.654335021972656, "loss": 0.6814, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.05704279616475105, "rewards/margins": 0.02517732046544552, "rewards/rejected": -0.08222011476755142, "step": 1960 }, { "epoch": 1.3576843556168159, "grad_norm": 1.4563038349151611, "learning_rate": 4.1770686012957165e-08, "logits/chosen": -2.973057270050049, "logits/rejected": -2.947322130203247, "logps/chosen": -60.245872497558594, "logps/rejected": -60.725914001464844, "loss": 0.6799, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.059306107461452484, "rewards/margins": 0.028189286589622498, "rewards/rejected": -0.08749540150165558, "step": 1970 }, { "epoch": 1.3645761543762922, "grad_norm": 1.3759511709213257, "learning_rate": 4.1658864670066e-08, "logits/chosen": -2.9715092182159424, "logits/rejected": -2.9540700912475586, "logps/chosen": -60.6792106628418, "logps/rejected": -61.75684356689453, "loss": 0.6835, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.061301182955503464, "rewards/margins": 0.0212391410022974, "rewards/rejected": -0.08254033327102661, "step": 1980 }, { "epoch": 1.3714679531357685, "grad_norm": 1.3124091625213623, "learning_rate": 4.154644062357629e-08, "logits/chosen": -2.935107469558716, "logits/rejected": -2.9147658348083496, "logps/chosen": -60.713287353515625, "logps/rejected": -61.98405075073242, "loss": 0.6809, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.05993304401636124, "rewards/margins": 0.026475939899683, "rewards/rejected": -0.08640898764133453, "step": 1990 }, { "epoch": 1.3783597518952446, "grad_norm": 1.339349627494812, "learning_rate": 4.143341794089469e-08, "logits/chosen": -3.0139946937561035, "logits/rejected": -2.997209072113037, "logps/chosen": -60.69626998901367, "logps/rejected": -62.69127655029297, "loss": 0.6824, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.06330845504999161, "rewards/margins": 0.023407040163874626, "rewards/rejected": -0.08671549707651138, "step": 2000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -3.0871129035949707, "eval_logits/rejected": -3.0814309120178223, "eval_logps/chosen": -61.951934814453125, "eval_logps/rejected": -67.53553771972656, "eval_loss": 0.6880571246147156, "eval_rewards/accuracies": 0.5938661694526672, "eval_rewards/chosen": -0.03240031749010086, "eval_rewards/margins": 0.011153885163366795, "eval_rewards/rejected": -0.04355420544743538, "eval_runtime": 383.012, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 2000 }, { "epoch": 1.385251550654721, "grad_norm": 1.471229910850525, "learning_rate": 4.1319800711086036e-08, "logits/chosen": -2.9599812030792236, "logits/rejected": -2.932666301727295, "logps/chosen": -62.118324279785156, "logps/rejected": -62.0120849609375, "loss": 0.6818, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.06376408040523529, "rewards/margins": 0.024828068912029266, "rewards/rejected": -0.08859214186668396, "step": 2010 }, { "epoch": 1.392143349414197, "grad_norm": 1.4093961715698242, "learning_rate": 4.120559304472536e-08, "logits/chosen": -2.9811830520629883, "logits/rejected": -2.9579930305480957, "logps/chosen": -60.16522979736328, "logps/rejected": -62.183387756347656, "loss": 0.6802, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.05942510813474655, "rewards/margins": 0.027757417410612106, "rewards/rejected": -0.08718253672122955, "step": 2020 }, { "epoch": 1.3990351481736734, "grad_norm": 1.3852018117904663, "learning_rate": 4.10907990737492e-08, "logits/chosen": -2.963920831680298, "logits/rejected": -2.940875768661499, "logps/chosen": -60.59895706176758, "logps/rejected": -62.0329475402832, "loss": 0.682, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06257908791303635, "rewards/margins": 0.024067074060440063, "rewards/rejected": -0.08664616197347641, "step": 2030 }, { "epoch": 1.4059269469331497, "grad_norm": 1.4350903034210205, "learning_rate": 4.0975422951306095e-08, "logits/chosen": -2.964158773422241, "logits/rejected": -2.9430880546569824, "logps/chosen": -60.4473876953125, "logps/rejected": -61.35686492919922, "loss": 0.6809, "rewards/accuracies": 0.609375, "rewards/chosen": -0.06520421802997589, "rewards/margins": 0.026435157284140587, "rewards/rejected": -0.09163938462734222, "step": 2040 }, { "epoch": 1.4128187456926258, "grad_norm": 1.3208574056625366, "learning_rate": 4.08594688516063e-08, "logits/chosen": -2.9904215335845947, "logits/rejected": -2.964301347732544, "logps/chosen": -63.49229049682617, "logps/rejected": -62.1002311706543, "loss": 0.6816, "rewards/accuracies": 0.609375, "rewards/chosen": -0.062480825930833817, "rewards/margins": 0.02507900632917881, "rewards/rejected": -0.08755983412265778, "step": 2050 }, { "epoch": 1.4197105444521019, "grad_norm": 1.4618719816207886, "learning_rate": 4.0742940969770864e-08, "logits/chosen": -2.9716999530792236, "logits/rejected": -2.9486472606658936, "logps/chosen": -61.4891357421875, "logps/rejected": -62.1390266418457, "loss": 0.6818, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0658407211303711, "rewards/margins": 0.024591678753495216, "rewards/rejected": -0.09043239057064056, "step": 2060 }, { "epoch": 1.4266023432115782, "grad_norm": 1.4130208492279053, "learning_rate": 4.062584352167971e-08, "logits/chosen": -3.0221781730651855, "logits/rejected": -3.0053608417510986, "logps/chosen": -62.087249755859375, "logps/rejected": -63.68877410888672, "loss": 0.6837, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07128091901540756, "rewards/margins": 0.020984884351491928, "rewards/rejected": -0.09226579964160919, "step": 2070 }, { "epoch": 1.4334941419710545, "grad_norm": 1.3860918283462524, "learning_rate": 4.0508180743819255e-08, "logits/chosen": -2.9790875911712646, "logits/rejected": -2.948974370956421, "logps/chosen": -62.810211181640625, "logps/rejected": -62.392189025878906, "loss": 0.6807, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.0623566210269928, "rewards/margins": 0.0267375148832798, "rewards/rejected": -0.0890941396355629, "step": 2080 }, { "epoch": 1.4403859407305306, "grad_norm": 1.4288688898086548, "learning_rate": 4.038995689312901e-08, "logits/chosen": -2.9385793209075928, "logits/rejected": -2.9186973571777344, "logps/chosen": -60.028076171875, "logps/rejected": -62.41402053833008, "loss": 0.6813, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.07332204282283783, "rewards/margins": 0.025766443461179733, "rewards/rejected": -0.09908848255872726, "step": 2090 }, { "epoch": 1.447277739490007, "grad_norm": 1.407326102256775, "learning_rate": 4.027117624684765e-08, "logits/chosen": -3.0112595558166504, "logits/rejected": -2.9882287979125977, "logps/chosen": -59.2943000793457, "logps/rejected": -63.46495819091797, "loss": 0.6799, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.0696934312582016, "rewards/margins": 0.028331905603408813, "rewards/rejected": -0.09802533686161041, "step": 2100 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -3.0810837745666504, "eval_logits/rejected": -3.0753631591796875, "eval_logps/chosen": -62.58242416381836, "eval_logps/rejected": -68.28349304199219, "eval_loss": 0.6875383257865906, "eval_rewards/accuracies": 0.5992100238800049, "eval_rewards/chosen": -0.03870531916618347, "eval_rewards/margins": 0.012328363955020905, "eval_rewards/rejected": -0.05103367939591408, "eval_runtime": 383.197, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 2100 }, { "epoch": 1.454169538249483, "grad_norm": 1.4594941139221191, "learning_rate": 4.0151843102358255e-08, "logits/chosen": -2.940186023712158, "logits/rejected": -2.919975757598877, "logps/chosen": -61.47553634643555, "logps/rejected": -62.88214874267578, "loss": 0.6793, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07076480239629745, "rewards/margins": 0.02989714778959751, "rewards/rejected": -0.10066194832324982, "step": 2110 }, { "epoch": 1.4610613370089593, "grad_norm": 1.3778456449508667, "learning_rate": 4.0031961777032796e-08, "logits/chosen": -2.957401752471924, "logits/rejected": -2.934462785720825, "logps/chosen": -59.739593505859375, "logps/rejected": -63.142906188964844, "loss": 0.679, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.07259052246809006, "rewards/margins": 0.030414488166570663, "rewards/rejected": -0.10300501435995102, "step": 2120 }, { "epoch": 1.4679531357684357, "grad_norm": 1.3951976299285889, "learning_rate": 3.991153660807599e-08, "logits/chosen": -2.9554810523986816, "logits/rejected": -2.9320640563964844, "logps/chosen": -62.29570388793945, "logps/rejected": -63.06371307373047, "loss": 0.6785, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.07179206609725952, "rewards/margins": 0.03167170286178589, "rewards/rejected": -0.10346376895904541, "step": 2130 }, { "epoch": 1.4748449345279118, "grad_norm": 1.3991068601608276, "learning_rate": 3.979057195236834e-08, "logits/chosen": -3.0165884494781494, "logits/rejected": -3.000462770462036, "logps/chosen": -61.51566696166992, "logps/rejected": -63.042503356933594, "loss": 0.6827, "rewards/accuracies": 0.5953124761581421, "rewards/chosen": -0.07359077781438828, "rewards/margins": 0.023021550849080086, "rewards/rejected": -0.09661233425140381, "step": 2140 }, { "epoch": 1.481736733287388, "grad_norm": 1.3886750936508179, "learning_rate": 3.9669072186308496e-08, "logits/chosen": -2.980654239654541, "logits/rejected": -2.953601598739624, "logps/chosen": -61.36989212036133, "logps/rejected": -62.92179489135742, "loss": 0.6774, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.06638355553150177, "rewards/margins": 0.03355860710144043, "rewards/rejected": -0.099942147731781, "step": 2150 }, { "epoch": 1.4886285320468642, "grad_norm": 1.4394383430480957, "learning_rate": 3.9547041705655e-08, "logits/chosen": -2.9374194145202637, "logits/rejected": -2.9163620471954346, "logps/chosen": -61.31916427612305, "logps/rejected": -61.365501403808594, "loss": 0.6818, "rewards/accuracies": 0.625, "rewards/chosen": -0.07227741181850433, "rewards/margins": 0.02476491592824459, "rewards/rejected": -0.09704232215881348, "step": 2160 }, { "epoch": 1.4955203308063405, "grad_norm": 1.424153447151184, "learning_rate": 3.942448492536717e-08, "logits/chosen": -2.973271369934082, "logits/rejected": -2.9581730365753174, "logps/chosen": -60.27112579345703, "logps/rejected": -63.340431213378906, "loss": 0.6794, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.07523776590824127, "rewards/margins": 0.029749874025583267, "rewards/rejected": -0.10498764365911484, "step": 2170 }, { "epoch": 1.5024121295658168, "grad_norm": 1.5541982650756836, "learning_rate": 3.930140627944539e-08, "logits/chosen": -2.981715679168701, "logits/rejected": -2.956012010574341, "logps/chosen": -62.48158645629883, "logps/rejected": -61.987754821777344, "loss": 0.6808, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.07177339494228363, "rewards/margins": 0.026721671223640442, "rewards/rejected": -0.09849507361650467, "step": 2180 }, { "epoch": 1.509303928325293, "grad_norm": 1.4756637811660767, "learning_rate": 3.9177810220770714e-08, "logits/chosen": -2.966075897216797, "logits/rejected": -2.946540355682373, "logps/chosen": -59.7988166809082, "logps/rejected": -63.67097091674805, "loss": 0.6803, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.07581230998039246, "rewards/margins": 0.02784324251115322, "rewards/rejected": -0.10365555435419083, "step": 2190 }, { "epoch": 1.516195727084769, "grad_norm": 1.502942681312561, "learning_rate": 3.905370122094375e-08, "logits/chosen": -2.9771804809570312, "logits/rejected": -2.9545178413391113, "logps/chosen": -61.055908203125, "logps/rejected": -62.46620559692383, "loss": 0.6793, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07532913982868195, "rewards/margins": 0.029998067766427994, "rewards/rejected": -0.10532720386981964, "step": 2200 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -3.0755319595336914, "eval_logits/rejected": -3.069783926010132, "eval_logps/chosen": -62.91606521606445, "eval_logps/rejected": -68.69403076171875, "eval_loss": 0.6872024536132812, "eval_rewards/accuracies": 0.5913103818893433, "eval_rewards/chosen": -0.04204174131155014, "eval_rewards/margins": 0.0130973095074296, "eval_rewards/rejected": -0.055139049887657166, "eval_runtime": 383.1499, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 2200 }, { "epoch": 1.5230875258442453, "grad_norm": 1.5146379470825195, "learning_rate": 3.892908377012286e-08, "logits/chosen": -2.9487671852111816, "logits/rejected": -2.9298737049102783, "logps/chosen": -59.63594436645508, "logps/rejected": -62.51985549926758, "loss": 0.6806, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.07553710043430328, "rewards/margins": 0.027343079447746277, "rewards/rejected": -0.10288016498088837, "step": 2210 }, { "epoch": 1.5299793246037217, "grad_norm": 1.4161518812179565, "learning_rate": 3.8803962376861776e-08, "logits/chosen": -3.0237414836883545, "logits/rejected": -2.9984359741210938, "logps/chosen": -61.302101135253906, "logps/rejected": -63.587738037109375, "loss": 0.6804, "rewards/accuracies": 0.609375, "rewards/chosen": -0.07388935983181, "rewards/margins": 0.027665745466947556, "rewards/rejected": -0.10155510902404785, "step": 2220 }, { "epoch": 1.5368711233631978, "grad_norm": 1.4803438186645508, "learning_rate": 3.86783415679464e-08, "logits/chosen": -2.992002010345459, "logits/rejected": -2.9662842750549316, "logps/chosen": -63.27616500854492, "logps/rejected": -63.127784729003906, "loss": 0.6785, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.07750724256038666, "rewards/margins": 0.03143255040049553, "rewards/rejected": -0.10893978923559189, "step": 2230 }, { "epoch": 1.5437629221226739, "grad_norm": 1.4667482376098633, "learning_rate": 3.8552225888231084e-08, "logits/chosen": -2.968416690826416, "logits/rejected": -2.949843406677246, "logps/chosen": -62.52892303466797, "logps/rejected": -64.35491180419922, "loss": 0.6808, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.07821829617023468, "rewards/margins": 0.027173910290002823, "rewards/rejected": -0.1053922176361084, "step": 2240 }, { "epoch": 1.5506547208821502, "grad_norm": 1.402271032333374, "learning_rate": 3.842561990047419e-08, "logits/chosen": -2.9871058464050293, "logits/rejected": -2.9605154991149902, "logps/chosen": -62.616737365722656, "logps/rejected": -63.326171875, "loss": 0.6785, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.0758453756570816, "rewards/margins": 0.03141525015234947, "rewards/rejected": -0.10726062208414078, "step": 2250 }, { "epoch": 1.5575465196416265, "grad_norm": 1.4089879989624023, "learning_rate": 3.829852818517301e-08, "logits/chosen": -2.929703712463379, "logits/rejected": -2.915001392364502, "logps/chosen": -61.0443000793457, "logps/rejected": -63.35173797607422, "loss": 0.6798, "rewards/accuracies": 0.625, "rewards/chosen": -0.08073240518569946, "rewards/margins": 0.02880897745490074, "rewards/rejected": -0.10954137146472931, "step": 2260 }, { "epoch": 1.5644383184011028, "grad_norm": 1.4021445512771606, "learning_rate": 3.8170955340398024e-08, "logits/chosen": -2.9641337394714355, "logits/rejected": -2.9475598335266113, "logps/chosen": -61.78564453125, "logps/rejected": -64.78318786621094, "loss": 0.6803, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.0803145095705986, "rewards/margins": 0.02784707583487034, "rewards/rejected": -0.10816159099340439, "step": 2270 }, { "epoch": 1.571330117160579, "grad_norm": 1.522268295288086, "learning_rate": 3.804290598162661e-08, "logits/chosen": -2.97495436668396, "logits/rejected": -2.9435651302337646, "logps/chosen": -64.36475372314453, "logps/rejected": -63.8470573425293, "loss": 0.6769, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.07467274367809296, "rewards/margins": 0.034745462238788605, "rewards/rejected": -0.10941819846630096, "step": 2280 }, { "epoch": 1.578221915920055, "grad_norm": 1.3183190822601318, "learning_rate": 3.7914384741575963e-08, "logits/chosen": -2.9750678539276123, "logits/rejected": -2.9480645656585693, "logps/chosen": -62.51726150512695, "logps/rejected": -63.919837951660156, "loss": 0.6763, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.07573171705007553, "rewards/margins": 0.03637672960758209, "rewards/rejected": -0.11210844665765762, "step": 2290 }, { "epoch": 1.5851137146795313, "grad_norm": 1.4644547700881958, "learning_rate": 3.778539627003561e-08, "logits/chosen": -3.015399694442749, "logits/rejected": -2.995026111602783, "logps/chosen": -62.95355987548828, "logps/rejected": -64.27342224121094, "loss": 0.6797, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08808152377605438, "rewards/margins": 0.02921540103852749, "rewards/rejected": -0.11729691922664642, "step": 2300 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -3.068023681640625, "eval_logits/rejected": -3.0622589588165283, "eval_logps/chosen": -63.56266784667969, "eval_logps/rejected": -69.44271087646484, "eval_loss": 0.6867676973342896, "eval_rewards/accuracies": 0.591775119304657, "eval_rewards/chosen": -0.0485076904296875, "eval_rewards/margins": 0.014118240214884281, "eval_rewards/rejected": -0.0626259446144104, "eval_runtime": 383.1715, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 2300 }, { "epoch": 1.5920055134390076, "grad_norm": 1.4287474155426025, "learning_rate": 3.7655945233699046e-08, "logits/chosen": -2.9111435413360596, "logits/rejected": -2.8938465118408203, "logps/chosen": -61.244232177734375, "logps/rejected": -63.44348907470703, "loss": 0.6802, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.08414885401725769, "rewards/margins": 0.028316324576735497, "rewards/rejected": -0.11246518045663834, "step": 2310 }, { "epoch": 1.598897312198484, "grad_norm": 1.5261116027832031, "learning_rate": 3.7526036315995024e-08, "logits/chosen": -2.9707908630371094, "logits/rejected": -2.9473297595977783, "logps/chosen": -61.3227653503418, "logps/rejected": -63.56636428833008, "loss": 0.6805, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.08494694530963898, "rewards/margins": 0.027700329199433327, "rewards/rejected": -0.11264727264642715, "step": 2320 }, { "epoch": 1.60578911095796, "grad_norm": 1.5030006170272827, "learning_rate": 3.739567421691803e-08, "logits/chosen": -2.9557173252105713, "logits/rejected": -2.936522960662842, "logps/chosen": -62.1193962097168, "logps/rejected": -64.6266860961914, "loss": 0.68, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.08890379220247269, "rewards/margins": 0.02855587564408779, "rewards/rejected": -0.11745966970920563, "step": 2330 }, { "epoch": 1.6126809097174362, "grad_norm": 1.4393787384033203, "learning_rate": 3.726486365285828e-08, "logits/chosen": -2.9760494232177734, "logits/rejected": -2.9487674236297607, "logps/chosen": -62.930320739746094, "logps/rejected": -64.93696594238281, "loss": 0.6763, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.08525412529706955, "rewards/margins": 0.03625521808862686, "rewards/rejected": -0.12150935083627701, "step": 2340 }, { "epoch": 1.6195727084769125, "grad_norm": 1.506498098373413, "learning_rate": 3.713360935643105e-08, "logits/chosen": -2.9618496894836426, "logits/rejected": -2.9410431385040283, "logps/chosen": -62.3760986328125, "logps/rejected": -64.449462890625, "loss": 0.6825, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0852210745215416, "rewards/margins": 0.023671802133321762, "rewards/rejected": -0.10889288038015366, "step": 2350 }, { "epoch": 1.6264645072363888, "grad_norm": 1.5402287244796753, "learning_rate": 3.7001916076305515e-08, "logits/chosen": -2.952976942062378, "logits/rejected": -2.93955659866333, "logps/chosen": -61.91290283203125, "logps/rejected": -63.161766052246094, "loss": 0.6815, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.09168025851249695, "rewards/margins": 0.025649910792708397, "rewards/rejected": -0.1173301711678505, "step": 2360 }, { "epoch": 1.633356305995865, "grad_norm": 1.4769482612609863, "learning_rate": 3.686978857703287e-08, "logits/chosen": -2.933389186859131, "logits/rejected": -2.9141061305999756, "logps/chosen": -62.54018020629883, "logps/rejected": -64.46358489990234, "loss": 0.6793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09470812976360321, "rewards/margins": 0.030456166714429855, "rewards/rejected": -0.12516430020332336, "step": 2370 }, { "epoch": 1.640248104755341, "grad_norm": 1.5358612537384033, "learning_rate": 3.6737231638874e-08, "logits/chosen": -2.964564561843872, "logits/rejected": -2.948787212371826, "logps/chosen": -61.36656951904297, "logps/rejected": -66.63494110107422, "loss": 0.6782, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.08621639758348465, "rewards/margins": 0.03227876499295235, "rewards/rejected": -0.1184951663017273, "step": 2380 }, { "epoch": 1.6471399035148173, "grad_norm": 1.62380850315094, "learning_rate": 3.660425005762656e-08, "logits/chosen": -2.9833335876464844, "logits/rejected": -2.9707391262054443, "logps/chosen": -62.497413635253906, "logps/rejected": -65.65277862548828, "loss": 0.6791, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09032244980335236, "rewards/margins": 0.030845990404486656, "rewards/rejected": -0.12116844952106476, "step": 2390 }, { "epoch": 1.6540317022742936, "grad_norm": 1.4775935411453247, "learning_rate": 3.647084864445137e-08, "logits/chosen": -2.997875452041626, "logits/rejected": -2.9765381813049316, "logps/chosen": -62.72658157348633, "logps/rejected": -65.87232971191406, "loss": 0.6792, "rewards/accuracies": 0.625, "rewards/chosen": -0.08752314746379852, "rewards/margins": 0.030504804104566574, "rewards/rejected": -0.1180279478430748, "step": 2400 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -3.0604095458984375, "eval_logits/rejected": -3.054663896560669, "eval_logps/chosen": -63.836509704589844, "eval_logps/rejected": -69.81018829345703, "eval_loss": 0.6863463521003723, "eval_rewards/accuracies": 0.5938661694526672, "eval_rewards/chosen": -0.051246125251054764, "eval_rewards/margins": 0.015054534189403057, "eval_rewards/rejected": -0.0663006603717804, "eval_runtime": 383.1349, "eval_samples_per_second": 11.234, "eval_steps_per_second": 1.404, "step": 2400 }, { "epoch": 1.66092350103377, "grad_norm": 1.5498945713043213, "learning_rate": 3.633703222569846e-08, "logits/chosen": -3.0100014209747314, "logits/rejected": -2.988426923751831, "logps/chosen": -64.33277893066406, "logps/rejected": -65.0008544921875, "loss": 0.6804, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08745310455560684, "rewards/margins": 0.027745071798563004, "rewards/rejected": -0.11519818007946014, "step": 2410 }, { "epoch": 1.667815299793246, "grad_norm": 1.4976897239685059, "learning_rate": 3.620280564273241e-08, "logits/chosen": -2.9590861797332764, "logits/rejected": -2.9411215782165527, "logps/chosen": -62.91560745239258, "logps/rejected": -64.81109619140625, "loss": 0.6807, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08753637224435806, "rewards/margins": 0.027622852474451065, "rewards/rejected": -0.11515922844409943, "step": 2420 }, { "epoch": 1.6747070985527222, "grad_norm": 1.5151920318603516, "learning_rate": 3.606817375175716e-08, "logits/chosen": -2.8968300819396973, "logits/rejected": -2.876819610595703, "logps/chosen": -62.963401794433594, "logps/rejected": -64.53592681884766, "loss": 0.68, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.09082961827516556, "rewards/margins": 0.02914278581738472, "rewards/rejected": -0.11997239291667938, "step": 2430 }, { "epoch": 1.6815988973121985, "grad_norm": 1.4628092050552368, "learning_rate": 3.5933141423640376e-08, "logits/chosen": -2.974097728729248, "logits/rejected": -2.9492757320404053, "logps/chosen": -62.92531204223633, "logps/rejected": -64.16741180419922, "loss": 0.678, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.08794812858104706, "rewards/margins": 0.0328797921538353, "rewards/rejected": -0.12082792818546295, "step": 2440 }, { "epoch": 1.6884906960716748, "grad_norm": 1.499584674835205, "learning_rate": 3.579771354373721e-08, "logits/chosen": -2.9544262886047363, "logits/rejected": -2.928666591644287, "logps/chosen": -64.64310455322266, "logps/rejected": -66.62565612792969, "loss": 0.6743, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.08341985940933228, "rewards/margins": 0.0407288484275341, "rewards/rejected": -0.12414871156215668, "step": 2450 }, { "epoch": 1.6953824948311509, "grad_norm": 1.454473853111267, "learning_rate": 3.5661895011713494e-08, "logits/chosen": -2.9589247703552246, "logits/rejected": -2.9348716735839844, "logps/chosen": -64.92304992675781, "logps/rejected": -67.33891296386719, "loss": 0.6773, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.09071369469165802, "rewards/margins": 0.03458043560385704, "rewards/rejected": -0.12529411911964417, "step": 2460 }, { "epoch": 1.7022742935906272, "grad_norm": 1.4910682439804077, "learning_rate": 3.552569074136858e-08, "logits/chosen": -2.9660167694091797, "logits/rejected": -2.952763319015503, "logps/chosen": -63.223480224609375, "logps/rejected": -64.98076629638672, "loss": 0.6808, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.09292306005954742, "rewards/margins": 0.027169913053512573, "rewards/rejected": -0.1200929656624794, "step": 2470 }, { "epoch": 1.7091660923501033, "grad_norm": 1.4306137561798096, "learning_rate": 3.5389105660457474e-08, "logits/chosen": -2.9617037773132324, "logits/rejected": -2.9344968795776367, "logps/chosen": -63.51398849487305, "logps/rejected": -63.80388259887695, "loss": 0.6748, "rewards/accuracies": 0.640625, "rewards/chosen": -0.08918656408786774, "rewards/margins": 0.039714910089969635, "rewards/rejected": -0.12890148162841797, "step": 2480 }, { "epoch": 1.7160578911095796, "grad_norm": 1.5343658924102783, "learning_rate": 3.525214471051258e-08, "logits/chosen": -3.0051958560943604, "logits/rejected": -2.9901459217071533, "logps/chosen": -62.88615798950195, "logps/rejected": -67.19429779052734, "loss": 0.682, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0929371640086174, "rewards/margins": 0.025157054886221886, "rewards/rejected": -0.11809422820806503, "step": 2490 }, { "epoch": 1.722949689869056, "grad_norm": 1.4824457168579102, "learning_rate": 3.511481284666496e-08, "logits/chosen": -2.9762489795684814, "logits/rejected": -2.950627088546753, "logps/chosen": -62.60149002075195, "logps/rejected": -65.52711486816406, "loss": 0.6775, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.0919216051697731, "rewards/margins": 0.03377654403448105, "rewards/rejected": -0.12569814920425415, "step": 2500 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -3.0545873641967773, "eval_logits/rejected": -3.048802614212036, "eval_logps/chosen": -64.23253631591797, "eval_logps/rejected": -70.27998352050781, "eval_loss": 0.6860299706459045, "eval_rewards/accuracies": 0.5945631861686707, "eval_rewards/chosen": -0.05520647391676903, "eval_rewards/margins": 0.01579216681420803, "eval_rewards/rejected": -0.07099863886833191, "eval_runtime": 383.2016, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 2500 }, { "epoch": 1.729841488628532, "grad_norm": 1.5480750799179077, "learning_rate": 3.4977115037464985e-08, "logits/chosen": -2.9983439445495605, "logits/rejected": -2.9733569622039795, "logps/chosen": -63.63037109375, "logps/rejected": -65.0792236328125, "loss": 0.6772, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.09059332311153412, "rewards/margins": 0.034389711916446686, "rewards/rejected": -0.1249830350279808, "step": 2510 }, { "epoch": 1.7367332873880081, "grad_norm": 1.5509904623031616, "learning_rate": 3.483905626470265e-08, "logits/chosen": -2.9320836067199707, "logits/rejected": -2.923464298248291, "logps/chosen": -62.970664978027344, "logps/rejected": -66.09261322021484, "loss": 0.6845, "rewards/accuracies": 0.573437511920929, "rewards/chosen": -0.10045228153467178, "rewards/margins": 0.020003201439976692, "rewards/rejected": -0.12045548856258392, "step": 2520 }, { "epoch": 1.7436250861474845, "grad_norm": 1.5070154666900635, "learning_rate": 3.470064152322728e-08, "logits/chosen": -3.0070574283599854, "logits/rejected": -2.9962799549102783, "logps/chosen": -62.90629196166992, "logps/rejected": -67.08839416503906, "loss": 0.6783, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.09492292255163193, "rewards/margins": 0.03227662295103073, "rewards/rejected": -0.12719956040382385, "step": 2530 }, { "epoch": 1.7505168849069608, "grad_norm": 1.5832630395889282, "learning_rate": 3.4561875820766864e-08, "logits/chosen": -2.9705777168273926, "logits/rejected": -2.9463553428649902, "logps/chosen": -64.19832611083984, "logps/rejected": -66.52311706542969, "loss": 0.6756, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.08893951773643494, "rewards/margins": 0.038228485733270645, "rewards/rejected": -0.12716799974441528, "step": 2540 }, { "epoch": 1.757408683666437, "grad_norm": 1.4966561794281006, "learning_rate": 3.442276417774684e-08, "logits/chosen": -2.9463987350463867, "logits/rejected": -2.9258177280426025, "logps/chosen": -61.95404052734375, "logps/rejected": -65.10017395019531, "loss": 0.678, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.09271355718374252, "rewards/margins": 0.03316948190331459, "rewards/rejected": -0.12588302791118622, "step": 2550 }, { "epoch": 1.7643004824259132, "grad_norm": 1.5156694650650024, "learning_rate": 3.4283311627108525e-08, "logits/chosen": -2.93735408782959, "logits/rejected": -2.921938419342041, "logps/chosen": -61.85235595703125, "logps/rejected": -66.46633911132812, "loss": 0.678, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.10028250515460968, "rewards/margins": 0.03319636732339859, "rewards/rejected": -0.13347885012626648, "step": 2560 }, { "epoch": 1.7711922811853893, "grad_norm": 1.518457055091858, "learning_rate": 3.4143523214126946e-08, "logits/chosen": -2.9486052989959717, "logits/rejected": -2.9321324825286865, "logps/chosen": -64.85343933105469, "logps/rejected": -67.1233139038086, "loss": 0.6774, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.09903400391340256, "rewards/margins": 0.034110672771930695, "rewards/rejected": -0.13314469158649445, "step": 2570 }, { "epoch": 1.7780840799448656, "grad_norm": 1.4665417671203613, "learning_rate": 3.4003403996228354e-08, "logits/chosen": -2.9156951904296875, "logits/rejected": -2.896273136138916, "logps/chosen": -64.50141906738281, "logps/rejected": -67.26720428466797, "loss": 0.6757, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.09724308550357819, "rewards/margins": 0.03806609660387039, "rewards/rejected": -0.13530918955802917, "step": 2580 }, { "epoch": 1.784975878704342, "grad_norm": 1.516481637954712, "learning_rate": 3.386295904280725e-08, "logits/chosen": -2.980966329574585, "logits/rejected": -2.956234931945801, "logps/chosen": -63.95067596435547, "logps/rejected": -66.26925659179688, "loss": 0.6763, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.09385517984628677, "rewards/margins": 0.03647611290216446, "rewards/rejected": -0.13033129274845123, "step": 2590 }, { "epoch": 1.791867677463818, "grad_norm": 1.6482516527175903, "learning_rate": 3.3722193435042965e-08, "logits/chosen": -2.9090423583984375, "logits/rejected": -2.886453628540039, "logps/chosen": -63.03009033203125, "logps/rejected": -67.27535247802734, "loss": 0.6768, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.10078646242618561, "rewards/margins": 0.03550305217504501, "rewards/rejected": -0.13628950715065002, "step": 2600 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -3.046949863433838, "eval_logits/rejected": -3.041172981262207, "eval_logps/chosen": -64.68827056884766, "eval_logps/rejected": -70.84428405761719, "eval_loss": 0.6855525374412537, "eval_rewards/accuracies": 0.5936338305473328, "eval_rewards/chosen": -0.05976375564932823, "eval_rewards/margins": 0.016877856105566025, "eval_rewards/rejected": -0.07664161175489426, "eval_runtime": 383.2122, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 2600 }, { "epoch": 1.7987594762232941, "grad_norm": 1.5153844356536865, "learning_rate": 3.358111226571583e-08, "logits/chosen": -2.984407901763916, "logits/rejected": -2.9609453678131104, "logps/chosen": -65.54800415039062, "logps/rejected": -67.45026397705078, "loss": 0.6777, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.09703333675861359, "rewards/margins": 0.03359491750597954, "rewards/rejected": -0.13062825798988342, "step": 2610 }, { "epoch": 1.8056512749827704, "grad_norm": 1.5824655294418335, "learning_rate": 3.3439720639022914e-08, "logits/chosen": -2.9343502521514893, "logits/rejected": -2.911482334136963, "logps/chosen": -63.74268341064453, "logps/rejected": -66.81242370605469, "loss": 0.6764, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09724105894565582, "rewards/margins": 0.036262378096580505, "rewards/rejected": -0.13350343704223633, "step": 2620 }, { "epoch": 1.8125430737422468, "grad_norm": 1.5564674139022827, "learning_rate": 3.32980236703934e-08, "logits/chosen": -2.966848134994507, "logits/rejected": -2.9508607387542725, "logps/chosen": -62.8245964050293, "logps/rejected": -67.39201354980469, "loss": 0.6753, "rewards/accuracies": 0.671875, "rewards/chosen": -0.0967872142791748, "rewards/margins": 0.03868573531508446, "rewards/rejected": -0.13547295331954956, "step": 2630 }, { "epoch": 1.819434872501723, "grad_norm": 1.5874587297439575, "learning_rate": 3.3156026486303463e-08, "logits/chosen": -2.945183753967285, "logits/rejected": -2.928147792816162, "logps/chosen": -64.11351776123047, "logps/rejected": -66.6250991821289, "loss": 0.6782, "rewards/accuracies": 0.589062511920929, "rewards/chosen": -0.10349839925765991, "rewards/margins": 0.03294640779495239, "rewards/rejected": -0.13644477725028992, "step": 2640 }, { "epoch": 1.8263266712611992, "grad_norm": 1.5243028402328491, "learning_rate": 3.301373422409082e-08, "logits/chosen": -2.987180709838867, "logits/rejected": -2.9657208919525146, "logps/chosen": -64.67195892333984, "logps/rejected": -67.37203979492188, "loss": 0.6763, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.10147979110479355, "rewards/margins": 0.036738764494657516, "rewards/rejected": -0.13821855187416077, "step": 2650 }, { "epoch": 1.8332184700206753, "grad_norm": 1.4651219844818115, "learning_rate": 3.287115203176887e-08, "logits/chosen": -2.924832344055176, "logits/rejected": -2.9061272144317627, "logps/chosen": -63.88309860229492, "logps/rejected": -66.52104949951172, "loss": 0.6774, "rewards/accuracies": 0.640625, "rewards/chosen": -0.10335004329681396, "rewards/margins": 0.03450380638241768, "rewards/rejected": -0.13785384595394135, "step": 2660 }, { "epoch": 1.8401102687801516, "grad_norm": 1.5452479124069214, "learning_rate": 3.2728285067840426e-08, "logits/chosen": -2.9556257724761963, "logits/rejected": -2.9312150478363037, "logps/chosen": -64.9068603515625, "logps/rejected": -67.5003890991211, "loss": 0.675, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.10720133781433105, "rewards/margins": 0.03944636136293411, "rewards/rejected": -0.14664770662784576, "step": 2670 }, { "epoch": 1.847002067539628, "grad_norm": 1.5371242761611938, "learning_rate": 3.258513850111112e-08, "logits/chosen": -2.9560184478759766, "logits/rejected": -2.929164409637451, "logps/chosen": -64.38623046875, "logps/rejected": -66.03816223144531, "loss": 0.677, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1039893850684166, "rewards/margins": 0.03530562296509743, "rewards/rejected": -0.13929501175880432, "step": 2680 }, { "epoch": 1.853893866299104, "grad_norm": 1.5125459432601929, "learning_rate": 3.244171751050235e-08, "logits/chosen": -2.9395947456359863, "logits/rejected": -2.91560697555542, "logps/chosen": -63.99969482421875, "logps/rejected": -66.66188049316406, "loss": 0.6764, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10402562469244003, "rewards/margins": 0.036394815891981125, "rewards/rejected": -0.14042045176029205, "step": 2690 }, { "epoch": 1.8607856650585803, "grad_norm": 1.5807576179504395, "learning_rate": 3.229802728486395e-08, "logits/chosen": -3.0009827613830566, "logits/rejected": -2.9793953895568848, "logps/chosen": -63.598472595214844, "logps/rejected": -68.16215515136719, "loss": 0.675, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.09963031113147736, "rewards/margins": 0.039490558207035065, "rewards/rejected": -0.1391208916902542, "step": 2700 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -3.040238857269287, "eval_logits/rejected": -3.034477472305298, "eval_logps/chosen": -65.2470703125, "eval_logps/rejected": -71.49956512451172, "eval_loss": 0.6851480603218079, "eval_rewards/accuracies": 0.5947955250740051, "eval_rewards/chosen": -0.06535184383392334, "eval_rewards/margins": 0.017842592671513557, "eval_rewards/rejected": -0.08319443464279175, "eval_runtime": 383.5697, "eval_samples_per_second": 11.221, "eval_steps_per_second": 1.403, "step": 2700 }, { "epoch": 1.8676774638180564, "grad_norm": 1.4955403804779053, "learning_rate": 3.215407302278644e-08, "logits/chosen": -2.942279100418091, "logits/rejected": -2.911083698272705, "logps/chosen": -63.947593688964844, "logps/rejected": -65.35000610351562, "loss": 0.6758, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.10127153247594833, "rewards/margins": 0.037911005318164825, "rewards/rejected": -0.13918253779411316, "step": 2710 }, { "epoch": 1.8745692625775328, "grad_norm": 1.5726432800292969, "learning_rate": 3.200985993241298e-08, "logits/chosen": -2.9280877113342285, "logits/rejected": -2.909559726715088, "logps/chosen": -62.73002243041992, "logps/rejected": -68.0972671508789, "loss": 0.6747, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.10201416164636612, "rewards/margins": 0.03989625722169876, "rewards/rejected": -0.14191041886806488, "step": 2720 }, { "epoch": 1.881461061337009, "grad_norm": 1.644063115119934, "learning_rate": 3.1865393231250884e-08, "logits/chosen": -2.972275733947754, "logits/rejected": -2.945162296295166, "logps/chosen": -65.08409118652344, "logps/rejected": -67.3410873413086, "loss": 0.6744, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.1016988605260849, "rewards/margins": 0.040388740599155426, "rewards/rejected": -0.14208757877349854, "step": 2730 }, { "epoch": 1.8883528600964852, "grad_norm": 1.6946022510528564, "learning_rate": 3.172067814598291e-08, "logits/chosen": -2.8961968421936035, "logits/rejected": -2.8743557929992676, "logps/chosen": -65.72113037109375, "logps/rejected": -67.9848403930664, "loss": 0.6769, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.10949108749628067, "rewards/margins": 0.035538312047719955, "rewards/rejected": -0.14502939581871033, "step": 2740 }, { "epoch": 1.8952446588559613, "grad_norm": 1.85281240940094, "learning_rate": 3.1575719912278146e-08, "logits/chosen": -2.9746181964874268, "logits/rejected": -2.9596455097198486, "logps/chosen": -65.26689147949219, "logps/rejected": -69.38256072998047, "loss": 0.678, "rewards/accuracies": 0.609375, "rewards/chosen": -0.11063704639673233, "rewards/margins": 0.033764228224754333, "rewards/rejected": -0.14440126717090607, "step": 2750 }, { "epoch": 1.9021364576154376, "grad_norm": 1.6351550817489624, "learning_rate": 3.143052377460257e-08, "logits/chosen": -2.9982078075408936, "logits/rejected": -2.979795455932617, "logps/chosen": -65.30284118652344, "logps/rejected": -68.33954620361328, "loss": 0.6735, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10436513274908066, "rewards/margins": 0.042677223682403564, "rewards/rejected": -0.14704236388206482, "step": 2760 }, { "epoch": 1.909028256374914, "grad_norm": 1.581804633140564, "learning_rate": 3.128509498602933e-08, "logits/chosen": -2.9717493057250977, "logits/rejected": -2.9375712871551514, "logps/chosen": -64.01644897460938, "logps/rejected": -67.28237915039062, "loss": 0.6712, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.10286970436573029, "rewards/margins": 0.047297943383455276, "rewards/rejected": -0.15016765892505646, "step": 2770 }, { "epoch": 1.9159200551343902, "grad_norm": 1.527143955230713, "learning_rate": 3.113943880804867e-08, "logits/chosen": -2.957034111022949, "logits/rejected": -2.9346654415130615, "logps/chosen": -66.92247009277344, "logps/rejected": -68.73655700683594, "loss": 0.6756, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.1139293909072876, "rewards/margins": 0.03857202082872391, "rewards/rejected": -0.1525014042854309, "step": 2780 }, { "epoch": 1.9228118538938663, "grad_norm": 1.5491613149642944, "learning_rate": 3.0993560510377636e-08, "logits/chosen": -2.936117172241211, "logits/rejected": -2.917527198791504, "logps/chosen": -66.13529968261719, "logps/rejected": -67.19880676269531, "loss": 0.6799, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.11159876734018326, "rewards/margins": 0.0296674482524395, "rewards/rejected": -0.14126621186733246, "step": 2790 }, { "epoch": 1.9297036526533424, "grad_norm": 1.687666893005371, "learning_rate": 3.084746537076932e-08, "logits/chosen": -2.9614017009735107, "logits/rejected": -2.9355201721191406, "logps/chosen": -65.43904113769531, "logps/rejected": -66.14453125, "loss": 0.6736, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.10608307272195816, "rewards/margins": 0.042284585535526276, "rewards/rejected": -0.14836764335632324, "step": 2800 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -3.0343620777130127, "eval_logits/rejected": -3.0285613536834717, "eval_logps/chosen": -65.7863540649414, "eval_logps/rejected": -72.14484405517578, "eval_loss": 0.6846997737884521, "eval_rewards/accuracies": 0.598280668258667, "eval_rewards/chosen": -0.07074455171823502, "eval_rewards/margins": 0.018902689218521118, "eval_rewards/rejected": -0.08964724093675613, "eval_runtime": 383.8232, "eval_samples_per_second": 11.213, "eval_steps_per_second": 1.402, "step": 2800 }, { "epoch": 1.9365954514128187, "grad_norm": 1.5789588689804077, "learning_rate": 3.070115867482202e-08, "logits/chosen": -2.9059557914733887, "logits/rejected": -2.8913381099700928, "logps/chosen": -64.81111145019531, "logps/rejected": -68.49555206298828, "loss": 0.6786, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11591235548257828, "rewards/margins": 0.03254670649766922, "rewards/rejected": -0.1484590470790863, "step": 2810 }, { "epoch": 1.943487250172295, "grad_norm": 1.543212652206421, "learning_rate": 3.0554645715787926e-08, "logits/chosen": -2.9375526905059814, "logits/rejected": -2.907182216644287, "logps/chosen": -66.82940673828125, "logps/rejected": -67.15513610839844, "loss": 0.6739, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.10829315334558487, "rewards/margins": 0.0416647270321846, "rewards/rejected": -0.14995788037776947, "step": 2820 }, { "epoch": 1.9503790489317712, "grad_norm": 1.6493370532989502, "learning_rate": 3.040793179438167e-08, "logits/chosen": -2.9627556800842285, "logits/rejected": -2.943434715270996, "logps/chosen": -66.67835998535156, "logps/rejected": -68.80840301513672, "loss": 0.6731, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.10611464083194733, "rewards/margins": 0.04375798627734184, "rewards/rejected": -0.14987263083457947, "step": 2830 }, { "epoch": 1.9572708476912473, "grad_norm": 1.5974655151367188, "learning_rate": 3.026102221858853e-08, "logits/chosen": -2.9443295001983643, "logits/rejected": -2.920044183731079, "logps/chosen": -66.07645416259766, "logps/rejected": -67.7667465209961, "loss": 0.6744, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1180158257484436, "rewards/margins": 0.04090026021003723, "rewards/rejected": -0.15891608595848083, "step": 2840 }, { "epoch": 1.9641626464507236, "grad_norm": 1.6743168830871582, "learning_rate": 3.0113922303472386e-08, "logits/chosen": -2.945852756500244, "logits/rejected": -2.922750949859619, "logps/chosen": -65.21817016601562, "logps/rejected": -68.2879867553711, "loss": 0.6745, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.11131735146045685, "rewards/margins": 0.04075629264116287, "rewards/rejected": -0.15207365155220032, "step": 2850 }, { "epoch": 1.9710544452102, "grad_norm": 1.5774295330047607, "learning_rate": 2.9966637370983444e-08, "logits/chosen": -2.9334816932678223, "logits/rejected": -2.908994197845459, "logps/chosen": -65.4717788696289, "logps/rejected": -67.16613006591797, "loss": 0.6785, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.11248987913131714, "rewards/margins": 0.03265548497438431, "rewards/rejected": -0.14514537155628204, "step": 2860 }, { "epoch": 1.9779462439696762, "grad_norm": 1.79238760471344, "learning_rate": 2.981917274976568e-08, "logits/chosen": -2.93747615814209, "logits/rejected": -2.9109816551208496, "logps/chosen": -66.53807067871094, "logps/rejected": -67.7362289428711, "loss": 0.6719, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.11242332309484482, "rewards/margins": 0.046326812356710434, "rewards/rejected": -0.15875014662742615, "step": 2870 }, { "epoch": 1.9848380427291523, "grad_norm": 1.6473580598831177, "learning_rate": 2.967153377496405e-08, "logits/chosen": -2.935229539871216, "logits/rejected": -2.906409502029419, "logps/chosen": -66.23171997070312, "logps/rejected": -66.33332824707031, "loss": 0.6756, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.11167053878307343, "rewards/margins": 0.038782913237810135, "rewards/rejected": -0.15045344829559326, "step": 2880 }, { "epoch": 1.9917298414886284, "grad_norm": 1.6364060640335083, "learning_rate": 2.9523725788031473e-08, "logits/chosen": -2.945809841156006, "logits/rejected": -2.921739101409912, "logps/chosen": -64.05226135253906, "logps/rejected": -67.63278198242188, "loss": 0.6748, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.1142185777425766, "rewards/margins": 0.03999228775501251, "rewards/rejected": -0.1542108654975891, "step": 2890 }, { "epoch": 1.9986216402481047, "grad_norm": 1.6330269575119019, "learning_rate": 2.9375754136535602e-08, "logits/chosen": -2.927903652191162, "logits/rejected": -2.905238628387451, "logps/chosen": -66.85380554199219, "logps/rejected": -69.30448150634766, "loss": 0.6773, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.11773806810379028, "rewards/margins": 0.034860290586948395, "rewards/rejected": -0.15259835124015808, "step": 2900 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -3.028317928314209, "eval_logits/rejected": -3.0225462913513184, "eval_logps/chosen": -66.17581176757812, "eval_logps/rejected": -72.60523223876953, "eval_loss": 0.6843954920768738, "eval_rewards/accuracies": 0.6019981503486633, "eval_rewards/chosen": -0.074639230966568, "eval_rewards/margins": 0.019611874595284462, "eval_rewards/rejected": -0.0942511036992073, "eval_runtime": 384.2768, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 2900 }, { "epoch": 2.005513439007581, "grad_norm": 1.6566567420959473, "learning_rate": 2.922762417396531e-08, "logits/chosen": -2.947601079940796, "logits/rejected": -2.925368309020996, "logps/chosen": -66.07593536376953, "logps/rejected": -68.32076263427734, "loss": 0.6744, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.11480854451656342, "rewards/margins": 0.04105811566114426, "rewards/rejected": -0.15586665272712708, "step": 2910 }, { "epoch": 2.0124052377670574, "grad_norm": 1.617571234703064, "learning_rate": 2.9079341259537044e-08, "logits/chosen": -2.916175127029419, "logits/rejected": -2.8837311267852783, "logps/chosen": -66.15825653076172, "logps/rejected": -69.4295883178711, "loss": 0.6674, "rewards/accuracies": 0.6859375238418579, "rewards/chosen": -0.10801906883716583, "rewards/margins": 0.055340588092803955, "rewards/rejected": -0.1633596420288086, "step": 2920 }, { "epoch": 2.0192970365265333, "grad_norm": 1.6272176504135132, "learning_rate": 2.893091075800092e-08, "logits/chosen": -2.9861085414886475, "logits/rejected": -2.959932565689087, "logps/chosen": -67.62165832519531, "logps/rejected": -69.6769790649414, "loss": 0.6737, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11099998652935028, "rewards/margins": 0.04300179332494736, "rewards/rejected": -0.15400178730487823, "step": 2930 }, { "epoch": 2.0261888352860096, "grad_norm": 1.7384891510009766, "learning_rate": 2.878233803944663e-08, "logits/chosen": -2.9693353176116943, "logits/rejected": -2.9573099613189697, "logps/chosen": -64.5928955078125, "logps/rejected": -70.68854522705078, "loss": 0.6749, "rewards/accuracies": 0.640625, "rewards/chosen": -0.11795774847269058, "rewards/margins": 0.04001060128211975, "rewards/rejected": -0.15796832740306854, "step": 2940 }, { "epoch": 2.033080634045486, "grad_norm": 1.5857065916061401, "learning_rate": 2.863362847910914e-08, "logits/chosen": -2.9030938148498535, "logits/rejected": -2.887363910675049, "logps/chosen": -65.62887573242188, "logps/rejected": -68.7466049194336, "loss": 0.6763, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1221378818154335, "rewards/margins": 0.03715101629495621, "rewards/rejected": -0.1592888981103897, "step": 2950 }, { "epoch": 2.039972432804962, "grad_norm": 1.5693594217300415, "learning_rate": 2.8484787457174276e-08, "logits/chosen": -2.9198808670043945, "logits/rejected": -2.905330181121826, "logps/chosen": -64.46752166748047, "logps/rejected": -68.55146789550781, "loss": 0.6765, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.12184862792491913, "rewards/margins": 0.03653833642601967, "rewards/rejected": -0.1583869606256485, "step": 2960 }, { "epoch": 2.0468642315644385, "grad_norm": 1.6655333042144775, "learning_rate": 2.833582035858399e-08, "logits/chosen": -2.903892993927002, "logits/rejected": -2.881399154663086, "logps/chosen": -66.0007095336914, "logps/rejected": -70.13135528564453, "loss": 0.6748, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.12246942520141602, "rewards/margins": 0.04046143963932991, "rewards/rejected": -0.16293087601661682, "step": 2970 }, { "epoch": 2.0537560303239144, "grad_norm": 1.5976343154907227, "learning_rate": 2.81867325728416e-08, "logits/chosen": -2.9313395023345947, "logits/rejected": -2.91597843170166, "logps/chosen": -66.84949493408203, "logps/rejected": -70.85432434082031, "loss": 0.6746, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.11942688375711441, "rewards/margins": 0.040991269052028656, "rewards/rejected": -0.16041815280914307, "step": 2980 }, { "epoch": 2.0606478290833907, "grad_norm": 1.6244357824325562, "learning_rate": 2.8037529493816785e-08, "logits/chosen": -2.9383299350738525, "logits/rejected": -2.9191508293151855, "logps/chosen": -66.07011413574219, "logps/rejected": -69.84043884277344, "loss": 0.6776, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.11829303205013275, "rewards/margins": 0.03461321443319321, "rewards/rejected": -0.15290623903274536, "step": 2990 }, { "epoch": 2.067539627842867, "grad_norm": 1.6982609033584595, "learning_rate": 2.788821651955044e-08, "logits/chosen": -2.950126886367798, "logits/rejected": -2.9213309288024902, "logps/chosen": -66.62999725341797, "logps/rejected": -69.92547607421875, "loss": 0.6724, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.11812355369329453, "rewards/margins": 0.045262694358825684, "rewards/rejected": -0.1633862555027008, "step": 3000 }, { "epoch": 2.067539627842867, "eval_logits/chosen": -3.0216457843780518, "eval_logits/rejected": -3.015843629837036, "eval_logps/chosen": -66.6414794921875, "eval_logps/rejected": -73.14647674560547, "eval_loss": 0.6840819716453552, "eval_rewards/accuracies": 0.6029275059700012, "eval_rewards/chosen": -0.07929594814777374, "eval_rewards/margins": 0.020367641001939774, "eval_rewards/rejected": -0.09966358542442322, "eval_runtime": 383.9065, "eval_samples_per_second": 11.211, "eval_steps_per_second": 1.401, "step": 3000 }, { "epoch": 2.0744314266023434, "grad_norm": 1.6537762880325317, "learning_rate": 2.773879905205936e-08, "logits/chosen": -2.9622812271118164, "logits/rejected": -2.9415974617004395, "logps/chosen": -64.69695281982422, "logps/rejected": -68.37895202636719, "loss": 0.6747, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.11607213318347931, "rewards/margins": 0.04073134809732437, "rewards/rejected": -0.15680348873138428, "step": 3010 }, { "epoch": 2.0813232253618192, "grad_norm": 1.6240638494491577, "learning_rate": 2.7589282497140826e-08, "logits/chosen": -2.9141321182250977, "logits/rejected": -2.8829143047332764, "logps/chosen": -66.66893005371094, "logps/rejected": -68.76286315917969, "loss": 0.6715, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.11584749072790146, "rewards/margins": 0.04733988642692566, "rewards/rejected": -0.16318736970424652, "step": 3020 }, { "epoch": 2.0882150241212956, "grad_norm": 1.5916928052902222, "learning_rate": 2.7439672264177017e-08, "logits/chosen": -2.929896593093872, "logits/rejected": -2.903773307800293, "logps/chosen": -68.9540786743164, "logps/rejected": -68.96758270263672, "loss": 0.6743, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.12060385942459106, "rewards/margins": 0.041497115045785904, "rewards/rejected": -0.16210097074508667, "step": 3030 }, { "epoch": 2.095106822880772, "grad_norm": 1.7152595520019531, "learning_rate": 2.7289973765939316e-08, "logits/chosen": -2.902541399002075, "logits/rejected": -2.881728410720825, "logps/chosen": -65.79480743408203, "logps/rejected": -69.61241149902344, "loss": 0.6742, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.11849594116210938, "rewards/margins": 0.04139474406838417, "rewards/rejected": -0.15989068150520325, "step": 3040 }, { "epoch": 2.101998621640248, "grad_norm": 1.6980488300323486, "learning_rate": 2.7140192418392456e-08, "logits/chosen": -2.8960652351379395, "logits/rejected": -2.8790650367736816, "logps/chosen": -66.33906555175781, "logps/rejected": -72.09800720214844, "loss": 0.6726, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12136466801166534, "rewards/margins": 0.045165225863456726, "rewards/rejected": -0.16652990877628326, "step": 3050 }, { "epoch": 2.1088904203997245, "grad_norm": 1.5801739692687988, "learning_rate": 2.699033364049858e-08, "logits/chosen": -2.8709168434143066, "logits/rejected": -2.8511486053466797, "logps/chosen": -65.98777770996094, "logps/rejected": -69.43630981445312, "loss": 0.6752, "rewards/accuracies": 0.640625, "rewards/chosen": -0.13038821518421173, "rewards/margins": 0.039398133754730225, "rewards/rejected": -0.16978636384010315, "step": 3060 }, { "epoch": 2.1157822191592004, "grad_norm": 1.9615129232406616, "learning_rate": 2.684040285402122e-08, "logits/chosen": -2.915796995162964, "logits/rejected": -2.8932700157165527, "logps/chosen": -66.6607894897461, "logps/rejected": -70.65106201171875, "loss": 0.6734, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.12578940391540527, "rewards/margins": 0.04341939836740494, "rewards/rejected": -0.16920879483222961, "step": 3070 }, { "epoch": 2.1226740179186767, "grad_norm": 1.8146353960037231, "learning_rate": 2.6690405483329103e-08, "logits/chosen": -2.8971009254455566, "logits/rejected": -2.873009204864502, "logps/chosen": -66.25140380859375, "logps/rejected": -69.65704345703125, "loss": 0.6715, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.11836614459753036, "rewards/margins": 0.04702942818403244, "rewards/rejected": -0.1653955727815628, "step": 3080 }, { "epoch": 2.129565816678153, "grad_norm": 1.7677499055862427, "learning_rate": 2.6540346955199894e-08, "logits/chosen": -2.908151388168335, "logits/rejected": -2.8839855194091797, "logps/chosen": -67.39698791503906, "logps/rejected": -69.55915832519531, "loss": 0.6741, "rewards/accuracies": 0.625, "rewards/chosen": -0.1253410279750824, "rewards/margins": 0.04227134585380554, "rewards/rejected": -0.16761238873004913, "step": 3090 }, { "epoch": 2.1364576154376294, "grad_norm": 1.8928585052490234, "learning_rate": 2.6390232698623925e-08, "logits/chosen": -2.944854259490967, "logits/rejected": -2.9351649284362793, "logps/chosen": -65.40177917480469, "logps/rejected": -71.36431884765625, "loss": 0.674, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.1255422830581665, "rewards/margins": 0.04232709854841232, "rewards/rejected": -0.16786937415599823, "step": 3100 }, { "epoch": 2.1364576154376294, "eval_logits/chosen": -3.01694917678833, "eval_logits/rejected": -3.011165142059326, "eval_logps/chosen": -66.9539566040039, "eval_logps/rejected": -73.53813171386719, "eval_loss": 0.6837360262870789, "eval_rewards/accuracies": 0.6029275059700012, "eval_rewards/chosen": -0.0824206992983818, "eval_rewards/margins": 0.02115933783352375, "eval_rewards/rejected": -0.103580042719841, "eval_runtime": 384.5551, "eval_samples_per_second": 11.192, "eval_steps_per_second": 1.399, "step": 3100 }, { "epoch": 2.1433494141971057, "grad_norm": 1.6396089792251587, "learning_rate": 2.624006814460772e-08, "logits/chosen": -2.8522231578826904, "logits/rejected": -2.8298919200897217, "logps/chosen": -66.54461669921875, "logps/rejected": -69.71440887451172, "loss": 0.6728, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.12153740227222443, "rewards/margins": 0.044904835522174835, "rewards/rejected": -0.16644224524497986, "step": 3110 }, { "epoch": 2.1502412129565815, "grad_norm": 1.6945747137069702, "learning_rate": 2.608985872597749e-08, "logits/chosen": -2.96403169631958, "logits/rejected": -2.9450039863586426, "logps/chosen": -66.55967712402344, "logps/rejected": -71.02167510986328, "loss": 0.6721, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12850666046142578, "rewards/margins": 0.046185724437236786, "rewards/rejected": -0.17469239234924316, "step": 3120 }, { "epoch": 2.157133011716058, "grad_norm": 1.7662471532821655, "learning_rate": 2.5939609877182672e-08, "logits/chosen": -2.913055896759033, "logits/rejected": -2.890643835067749, "logps/chosen": -66.29566955566406, "logps/rejected": -71.18801879882812, "loss": 0.6705, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.12703628838062286, "rewards/margins": 0.0493118092417717, "rewards/rejected": -0.17634810507297516, "step": 3130 }, { "epoch": 2.164024810475534, "grad_norm": 1.7463346719741821, "learning_rate": 2.5789327034099196e-08, "logits/chosen": -2.954117774963379, "logits/rejected": -2.932440757751465, "logps/chosen": -66.779052734375, "logps/rejected": -69.97312927246094, "loss": 0.6727, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.1267065554857254, "rewards/margins": 0.044935114681720734, "rewards/rejected": -0.17164166271686554, "step": 3140 }, { "epoch": 2.1709166092350105, "grad_norm": 1.7622534036636353, "learning_rate": 2.5639015633832895e-08, "logits/chosen": -2.9070656299591064, "logits/rejected": -2.874222993850708, "logps/chosen": -68.94412231445312, "logps/rejected": -69.67718505859375, "loss": 0.6704, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12387894093990326, "rewards/margins": 0.04964732006192207, "rewards/rejected": -0.17352625727653503, "step": 3150 }, { "epoch": 2.1778084079944864, "grad_norm": 1.815160870552063, "learning_rate": 2.548868111452281e-08, "logits/chosen": -2.9767816066741943, "logits/rejected": -2.9645321369171143, "logps/chosen": -66.39973449707031, "logps/rejected": -70.70191955566406, "loss": 0.6735, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12922878563404083, "rewards/margins": 0.043558526784181595, "rewards/rejected": -0.17278730869293213, "step": 3160 }, { "epoch": 2.1847002067539627, "grad_norm": 1.5723103284835815, "learning_rate": 2.5338328915144336e-08, "logits/chosen": -2.928403377532959, "logits/rejected": -2.9034788608551025, "logps/chosen": -65.00230407714844, "logps/rejected": -68.98863220214844, "loss": 0.6732, "rewards/accuracies": 0.640625, "rewards/chosen": -0.125395730137825, "rewards/margins": 0.04399869218468666, "rewards/rejected": -0.16939441859722137, "step": 3170 }, { "epoch": 2.191592005513439, "grad_norm": 1.6258127689361572, "learning_rate": 2.5187964475312597e-08, "logits/chosen": -2.929079055786133, "logits/rejected": -2.909407377243042, "logps/chosen": -67.30826568603516, "logps/rejected": -71.10563659667969, "loss": 0.6735, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.1259099245071411, "rewards/margins": 0.043546341359615326, "rewards/rejected": -0.16945627331733704, "step": 3180 }, { "epoch": 2.1984838042729153, "grad_norm": 1.6828006505966187, "learning_rate": 2.503759323508552e-08, "logits/chosen": -2.881176471710205, "logits/rejected": -2.8579916954040527, "logps/chosen": -64.82734680175781, "logps/rejected": -69.67784118652344, "loss": 0.6727, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.126509889960289, "rewards/margins": 0.04497005417943001, "rewards/rejected": -0.1714799404144287, "step": 3190 }, { "epoch": 2.205375603032391, "grad_norm": 1.6321678161621094, "learning_rate": 2.4887220634767067e-08, "logits/chosen": -2.9224982261657715, "logits/rejected": -2.9001450538635254, "logps/chosen": -68.78169250488281, "logps/rejected": -71.27153015136719, "loss": 0.6764, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.13695082068443298, "rewards/margins": 0.03738374635577202, "rewards/rejected": -0.1743346005678177, "step": 3200 }, { "epoch": 2.205375603032391, "eval_logits/chosen": -3.010462760925293, "eval_logits/rejected": -3.0046679973602295, "eval_logps/chosen": -67.28557586669922, "eval_logps/rejected": -73.93904876708984, "eval_loss": 0.6834473609924316, "eval_rewards/accuracies": 0.6066449880599976, "eval_rewards/chosen": -0.08573678135871887, "eval_rewards/margins": 0.02185242623090744, "eval_rewards/rejected": -0.10758921504020691, "eval_runtime": 383.4226, "eval_samples_per_second": 11.225, "eval_steps_per_second": 1.403, "step": 3200 }, { "epoch": 2.2122674017918675, "grad_norm": 1.8959060907363892, "learning_rate": 2.4736852114710417e-08, "logits/chosen": -2.906376361846924, "logits/rejected": -2.892958641052246, "logps/chosen": -65.91130065917969, "logps/rejected": -71.43583679199219, "loss": 0.6769, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.13590016961097717, "rewards/margins": 0.03660494461655617, "rewards/rejected": -0.17250511050224304, "step": 3210 }, { "epoch": 2.219159200551344, "grad_norm": 1.588124394416809, "learning_rate": 2.458649311512114e-08, "logits/chosen": -2.9102871417999268, "logits/rejected": -2.891556739807129, "logps/chosen": -66.97811126708984, "logps/rejected": -70.01335144042969, "loss": 0.6728, "rewards/accuracies": 0.625, "rewards/chosen": -0.13257083296775818, "rewards/margins": 0.04481132701039314, "rewards/rejected": -0.17738215625286102, "step": 3220 }, { "epoch": 2.22605099931082, "grad_norm": 1.7511128187179565, "learning_rate": 2.443614907586034e-08, "logits/chosen": -2.946758508682251, "logits/rejected": -2.929172992706299, "logps/chosen": -66.30904388427734, "logps/rejected": -72.10446166992188, "loss": 0.67, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -0.12789154052734375, "rewards/margins": 0.050627827644348145, "rewards/rejected": -0.1785193830728531, "step": 3230 }, { "epoch": 2.2329427980702965, "grad_norm": 1.7126487493515015, "learning_rate": 2.4285825436247875e-08, "logits/chosen": -2.8866331577301025, "logits/rejected": -2.8642492294311523, "logps/chosen": -66.24880981445312, "logps/rejected": -69.57875061035156, "loss": 0.6722, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.1270383596420288, "rewards/margins": 0.04567421227693558, "rewards/rejected": -0.17271257936954498, "step": 3240 }, { "epoch": 2.2398345968297724, "grad_norm": 1.7465450763702393, "learning_rate": 2.413552763486558e-08, "logits/chosen": -2.889232873916626, "logits/rejected": -2.866527795791626, "logps/chosen": -67.50992584228516, "logps/rejected": -70.2740707397461, "loss": 0.6705, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.13593466579914093, "rewards/margins": 0.04922456666827202, "rewards/rejected": -0.18515923619270325, "step": 3250 }, { "epoch": 2.2467263955892487, "grad_norm": 1.8346824645996094, "learning_rate": 2.3985261109360457e-08, "logits/chosen": -2.917980670928955, "logits/rejected": -2.8951916694641113, "logps/chosen": -65.9865493774414, "logps/rejected": -70.16229248046875, "loss": 0.6715, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.13435958325862885, "rewards/margins": 0.047860510647296906, "rewards/rejected": -0.18222011625766754, "step": 3260 }, { "epoch": 2.253618194348725, "grad_norm": 1.72318696975708, "learning_rate": 2.3835031296247988e-08, "logits/chosen": -2.904421806335449, "logits/rejected": -2.8919169902801514, "logps/chosen": -66.15772247314453, "logps/rejected": -70.02629852294922, "loss": 0.6769, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.12967680394649506, "rewards/margins": 0.036128487437963486, "rewards/rejected": -0.16580531001091003, "step": 3270 }, { "epoch": 2.2605099931082013, "grad_norm": 1.706644058227539, "learning_rate": 2.3684843630715446e-08, "logits/chosen": -2.9575226306915283, "logits/rejected": -2.9317879676818848, "logps/chosen": -67.58252716064453, "logps/rejected": -69.60554504394531, "loss": 0.6734, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.13426518440246582, "rewards/margins": 0.043473247438669205, "rewards/rejected": -0.17773844301700592, "step": 3280 }, { "epoch": 2.2674017918676777, "grad_norm": 1.7198386192321777, "learning_rate": 2.3534703546425203e-08, "logits/chosen": -2.8975813388824463, "logits/rejected": -2.874642848968506, "logps/chosen": -67.80191802978516, "logps/rejected": -73.37427520751953, "loss": 0.6686, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12964396178722382, "rewards/margins": 0.05359124019742012, "rewards/rejected": -0.18323521316051483, "step": 3290 }, { "epoch": 2.2742935906271535, "grad_norm": 1.7592954635620117, "learning_rate": 2.338461647531821e-08, "logits/chosen": -2.9084677696228027, "logits/rejected": -2.8876278400421143, "logps/chosen": -67.4603271484375, "logps/rejected": -69.23651885986328, "loss": 0.6749, "rewards/accuracies": 0.625, "rewards/chosen": -0.13029661774635315, "rewards/margins": 0.04010510444641113, "rewards/rejected": -0.17040172219276428, "step": 3300 }, { "epoch": 2.2742935906271535, "eval_logits/chosen": -3.0049281120300293, "eval_logits/rejected": -2.999133825302124, "eval_logps/chosen": -67.58458709716797, "eval_logps/rejected": -74.31029510498047, "eval_loss": 0.6831346750259399, "eval_rewards/accuracies": 0.606877326965332, "eval_rewards/chosen": -0.0887269377708435, "eval_rewards/margins": 0.02257475070655346, "eval_rewards/rejected": -0.11130168288946152, "eval_runtime": 383.2655, "eval_samples_per_second": 11.23, "eval_steps_per_second": 1.404, "step": 3300 }, { "epoch": 2.28118538938663, "grad_norm": 1.6773444414138794, "learning_rate": 2.3234587847417447e-08, "logits/chosen": -2.920926570892334, "logits/rejected": -2.9043517112731934, "logps/chosen": -65.21070861816406, "logps/rejected": -70.92481231689453, "loss": 0.674, "rewards/accuracies": 0.640625, "rewards/chosen": -0.12870939075946808, "rewards/margins": 0.04243508726358414, "rewards/rejected": -0.17114447057247162, "step": 3310 }, { "epoch": 2.288077188146106, "grad_norm": 1.7598364353179932, "learning_rate": 2.3084623090631447e-08, "logits/chosen": -2.9082295894622803, "logits/rejected": -2.89711856842041, "logps/chosen": -65.43981170654297, "logps/rejected": -71.70658874511719, "loss": 0.6723, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.13450407981872559, "rewards/margins": 0.04599494859576225, "rewards/rejected": -0.18049903213977814, "step": 3320 }, { "epoch": 2.2949689869055825, "grad_norm": 1.6961475610733032, "learning_rate": 2.2934727630557967e-08, "logits/chosen": -2.923133373260498, "logits/rejected": -2.899732828140259, "logps/chosen": -68.70796966552734, "logps/rejected": -69.78541564941406, "loss": 0.6753, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.14205260574817657, "rewards/margins": 0.039856284856796265, "rewards/rejected": -0.18190889060497284, "step": 3330 }, { "epoch": 2.301860785665059, "grad_norm": 1.7416054010391235, "learning_rate": 2.278490689028765e-08, "logits/chosen": -2.918747663497925, "logits/rejected": -2.8960041999816895, "logps/chosen": -70.15538024902344, "logps/rejected": -70.29084777832031, "loss": 0.6752, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.13928914070129395, "rewards/margins": 0.04022783786058426, "rewards/rejected": -0.1795169860124588, "step": 3340 }, { "epoch": 2.3087525844245347, "grad_norm": 1.7422951459884644, "learning_rate": 2.263516629020784e-08, "logits/chosen": -2.8484020233154297, "logits/rejected": -2.817312240600586, "logps/chosen": -69.0149154663086, "logps/rejected": -70.68285369873047, "loss": 0.6692, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.13205787539482117, "rewards/margins": 0.052597712725400925, "rewards/rejected": -0.1846555769443512, "step": 3350 }, { "epoch": 2.315644383184011, "grad_norm": 1.6877411603927612, "learning_rate": 2.2485511247806493e-08, "logits/chosen": -2.9063937664031982, "logits/rejected": -2.8901989459991455, "logps/chosen": -67.65453338623047, "logps/rejected": -71.82198333740234, "loss": 0.6759, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.1382322609424591, "rewards/margins": 0.03894598037004471, "rewards/rejected": -0.17717823386192322, "step": 3360 }, { "epoch": 2.3225361819434873, "grad_norm": 1.9009865522384644, "learning_rate": 2.233594717747614e-08, "logits/chosen": -2.884256362915039, "logits/rejected": -2.8540446758270264, "logps/chosen": -65.9334716796875, "logps/rejected": -69.4367904663086, "loss": 0.669, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.12853553891181946, "rewards/margins": 0.05261304974555969, "rewards/rejected": -0.18114858865737915, "step": 3370 }, { "epoch": 2.3294279807029636, "grad_norm": 1.962171196937561, "learning_rate": 2.2186479490318026e-08, "logits/chosen": -2.8672804832458496, "logits/rejected": -2.8503494262695312, "logps/chosen": -66.8774185180664, "logps/rejected": -71.82426452636719, "loss": 0.6721, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.14432063698768616, "rewards/margins": 0.04692777991294861, "rewards/rejected": -0.19124841690063477, "step": 3380 }, { "epoch": 2.3363197794624395, "grad_norm": 1.7798832654953003, "learning_rate": 2.203711359394635e-08, "logits/chosen": -2.8842272758483887, "logits/rejected": -2.8633220195770264, "logps/chosen": -67.9321060180664, "logps/rejected": -69.65855407714844, "loss": 0.6768, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.14562222361564636, "rewards/margins": 0.03673269599676132, "rewards/rejected": -0.18235492706298828, "step": 3390 }, { "epoch": 2.343211578221916, "grad_norm": 1.8642452955245972, "learning_rate": 2.1887854892292585e-08, "logits/chosen": -2.847668170928955, "logits/rejected": -2.836843967437744, "logps/chosen": -67.62361145019531, "logps/rejected": -72.31847381591797, "loss": 0.6746, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.14204531908035278, "rewards/margins": 0.041754744946956635, "rewards/rejected": -0.18380005657672882, "step": 3400 }, { "epoch": 2.343211578221916, "eval_logits/chosen": -3.000178575515747, "eval_logits/rejected": -2.9943742752075195, "eval_logps/chosen": -67.92466735839844, "eval_logps/rejected": -74.72300720214844, "eval_loss": 0.6828265190124512, "eval_rewards/accuracies": 0.6054832935333252, "eval_rewards/chosen": -0.092127725481987, "eval_rewards/margins": 0.023301174864172935, "eval_rewards/rejected": -0.11542889475822449, "eval_runtime": 383.2341, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 3400 }, { "epoch": 2.350103376981392, "grad_norm": 1.7967503070831299, "learning_rate": 2.1738708785409993e-08, "logits/chosen": -2.9103665351867676, "logits/rejected": -2.8803939819335938, "logps/chosen": -68.15880584716797, "logps/rejected": -70.93882751464844, "loss": 0.6704, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.13531838357448578, "rewards/margins": 0.050173722207546234, "rewards/rejected": -0.18549209833145142, "step": 3410 }, { "epoch": 2.3569951757408685, "grad_norm": 1.703116774559021, "learning_rate": 2.1589680669278273e-08, "logits/chosen": -2.8994393348693848, "logits/rejected": -2.8751065731048584, "logps/chosen": -68.01866149902344, "logps/rejected": -70.9808578491211, "loss": 0.6739, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.13914243876934052, "rewards/margins": 0.04300328344106674, "rewards/rejected": -0.18214571475982666, "step": 3420 }, { "epoch": 2.3638869745003444, "grad_norm": 1.7121405601501465, "learning_rate": 2.14407759356083e-08, "logits/chosen": -2.874730348587036, "logits/rejected": -2.8425934314727783, "logps/chosen": -68.30497741699219, "logps/rejected": -71.5040054321289, "loss": 0.6712, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.13123171031475067, "rewards/margins": 0.04810131713747978, "rewards/rejected": -0.17933301627635956, "step": 3430 }, { "epoch": 2.3707787732598207, "grad_norm": 1.9272888898849487, "learning_rate": 2.1291999971647077e-08, "logits/chosen": -2.9677257537841797, "logits/rejected": -2.9411613941192627, "logps/chosen": -69.03953552246094, "logps/rejected": -72.7578353881836, "loss": 0.6695, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.13276907801628113, "rewards/margins": 0.05193784832954407, "rewards/rejected": -0.1847069263458252, "step": 3440 }, { "epoch": 2.377670572019297, "grad_norm": 1.7714530229568481, "learning_rate": 2.1143358159982836e-08, "logits/chosen": -2.9841716289520264, "logits/rejected": -2.9558756351470947, "logps/chosen": -69.5213394165039, "logps/rejected": -71.89505767822266, "loss": 0.6694, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.13687150180339813, "rewards/margins": 0.05198253318667412, "rewards/rejected": -0.18885403871536255, "step": 3450 }, { "epoch": 2.3845623707787733, "grad_norm": 1.7453669309616089, "learning_rate": 2.0994855878350274e-08, "logits/chosen": -2.938234329223633, "logits/rejected": -2.9147391319274902, "logps/chosen": -69.4118423461914, "logps/rejected": -71.44087219238281, "loss": 0.6694, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.13282287120819092, "rewards/margins": 0.052209965884685516, "rewards/rejected": -0.18503285944461823, "step": 3460 }, { "epoch": 2.3914541695382496, "grad_norm": 1.8331879377365112, "learning_rate": 2.084649849943604e-08, "logits/chosen": -2.8791732788085938, "logits/rejected": -2.852694034576416, "logps/chosen": -68.8432846069336, "logps/rejected": -71.63152313232422, "loss": 0.6718, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1510583609342575, "rewards/margins": 0.0482449047267437, "rewards/rejected": -0.1993032693862915, "step": 3470 }, { "epoch": 2.3983459682977255, "grad_norm": 1.8445225954055786, "learning_rate": 2.0698291390684307e-08, "logits/chosen": -2.8736281394958496, "logits/rejected": -2.846642255783081, "logps/chosen": -70.47149658203125, "logps/rejected": -70.84956359863281, "loss": 0.6713, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.13976088166236877, "rewards/margins": 0.048269350081682205, "rewards/rejected": -0.18803022801876068, "step": 3480 }, { "epoch": 2.405237767057202, "grad_norm": 1.7324934005737305, "learning_rate": 2.0550239914102593e-08, "logits/chosen": -2.8452301025390625, "logits/rejected": -2.826636552810669, "logps/chosen": -66.49258422851562, "logps/rejected": -71.50048828125, "loss": 0.6707, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.13444240391254425, "rewards/margins": 0.049656014889478683, "rewards/rejected": -0.18409840762615204, "step": 3490 }, { "epoch": 2.412129565816678, "grad_norm": 1.7781845331192017, "learning_rate": 2.0402349426067798e-08, "logits/chosen": -2.859943389892578, "logits/rejected": -2.83974027633667, "logps/chosen": -69.46940612792969, "logps/rejected": -72.5849838256836, "loss": 0.6718, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14154665172100067, "rewards/margins": 0.04743730276823044, "rewards/rejected": -0.1889839619398117, "step": 3500 }, { "epoch": 2.412129565816678, "eval_logits/chosen": -2.994781494140625, "eval_logits/rejected": -2.988985061645508, "eval_logps/chosen": -68.3349609375, "eval_logps/rejected": -75.22129821777344, "eval_loss": 0.6824442148208618, "eval_rewards/accuracies": 0.606877326965332, "eval_rewards/chosen": -0.09623068571090698, "eval_rewards/margins": 0.024181002750992775, "eval_rewards/rejected": -0.1204117015004158, "eval_runtime": 382.8762, "eval_samples_per_second": 11.241, "eval_steps_per_second": 1.405, "step": 3500 }, { "epoch": 2.4190213645761545, "grad_norm": 1.915948510169983, "learning_rate": 2.0254625277132383e-08, "logits/chosen": -2.8515450954437256, "logits/rejected": -2.8271920680999756, "logps/chosen": -69.50716400146484, "logps/rejected": -69.76374816894531, "loss": 0.6727, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.1438041627407074, "rewards/margins": 0.045613210648298264, "rewards/rejected": -0.18941737711429596, "step": 3510 }, { "epoch": 2.425913163335631, "grad_norm": 1.8744242191314697, "learning_rate": 2.0107072811830786e-08, "logits/chosen": -2.8902485370635986, "logits/rejected": -2.859198808670044, "logps/chosen": -70.08866882324219, "logps/rejected": -71.80384063720703, "loss": 0.6713, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.14221274852752686, "rewards/margins": 0.04806675389409065, "rewards/rejected": -0.1902794986963272, "step": 3520 }, { "epoch": 2.4328049620951067, "grad_norm": 1.7954890727996826, "learning_rate": 1.9959697368486107e-08, "logits/chosen": -2.896947145462036, "logits/rejected": -2.8700497150421143, "logps/chosen": -67.94786071777344, "logps/rejected": -71.10428619384766, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": -0.1401069611310959, "rewards/margins": 0.04472484439611435, "rewards/rejected": -0.18483181297779083, "step": 3530 }, { "epoch": 2.439696760854583, "grad_norm": 1.9259308576583862, "learning_rate": 1.9812504279016915e-08, "logits/chosen": -2.9304146766662598, "logits/rejected": -2.9128377437591553, "logps/chosen": -68.08967590332031, "logps/rejected": -72.80455017089844, "loss": 0.6735, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.14570018649101257, "rewards/margins": 0.04376266524195671, "rewards/rejected": -0.18946287035942078, "step": 3540 }, { "epoch": 2.4465885596140593, "grad_norm": 1.769897699356079, "learning_rate": 1.9665498868744378e-08, "logits/chosen": -2.849870204925537, "logits/rejected": -2.822788715362549, "logps/chosen": -70.72515106201172, "logps/rejected": -72.82359313964844, "loss": 0.6702, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.1434292048215866, "rewards/margins": 0.05117066577076912, "rewards/rejected": -0.19459988176822662, "step": 3550 }, { "epoch": 2.4534803583735356, "grad_norm": 1.769606590270996, "learning_rate": 1.95186864561996e-08, "logits/chosen": -2.893775224685669, "logits/rejected": -2.8753342628479004, "logps/chosen": -69.7079849243164, "logps/rejected": -72.93263244628906, "loss": 0.6714, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.14495253562927246, "rewards/margins": 0.04877477139234543, "rewards/rejected": -0.1937273144721985, "step": 3560 }, { "epoch": 2.460372157133012, "grad_norm": 1.9118810892105103, "learning_rate": 1.9372072352931186e-08, "logits/chosen": -2.861694812774658, "logits/rejected": -2.8357861042022705, "logps/chosen": -69.12107849121094, "logps/rejected": -71.00257873535156, "loss": 0.6725, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.14611545205116272, "rewards/margins": 0.045843787491321564, "rewards/rejected": -0.1919592320919037, "step": 3570 }, { "epoch": 2.467263955892488, "grad_norm": 1.7760262489318848, "learning_rate": 1.9225661863313063e-08, "logits/chosen": -2.8602583408355713, "logits/rejected": -2.8373143672943115, "logps/chosen": -69.46070098876953, "logps/rejected": -72.57988739013672, "loss": 0.6693, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -0.13329514861106873, "rewards/margins": 0.05266965180635452, "rewards/rejected": -0.18596479296684265, "step": 3580 }, { "epoch": 2.474155754651964, "grad_norm": 1.9521143436431885, "learning_rate": 1.9079460284352616e-08, "logits/chosen": -2.918376922607422, "logits/rejected": -2.9052844047546387, "logps/chosen": -69.78836822509766, "logps/rejected": -74.49900817871094, "loss": 0.6688, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.14706926047801971, "rewards/margins": 0.05400104448199272, "rewards/rejected": -0.20107030868530273, "step": 3590 }, { "epoch": 2.4810475534114405, "grad_norm": 1.84034264087677, "learning_rate": 1.893347290549901e-08, "logits/chosen": -2.919541597366333, "logits/rejected": -2.892209768295288, "logps/chosen": -68.73023223876953, "logps/rejected": -70.14763641357422, "loss": 0.672, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.14669956266880035, "rewards/margins": 0.046793993562459946, "rewards/rejected": -0.1934935748577118, "step": 3600 }, { "epoch": 2.4810475534114405, "eval_logits/chosen": -2.9901604652404785, "eval_logits/rejected": -2.984393358230591, "eval_logps/chosen": -68.84392547607422, "eval_logps/rejected": -75.79364776611328, "eval_loss": 0.6821951866149902, "eval_rewards/accuracies": 0.604786217212677, "eval_rewards/chosen": -0.101320281624794, "eval_rewards/margins": 0.024814918637275696, "eval_rewards/rejected": -0.1261351853609085, "eval_runtime": 383.447, "eval_samples_per_second": 11.224, "eval_steps_per_second": 1.403, "step": 3600 }, { "epoch": 2.4879393521709168, "grad_norm": 1.850649118423462, "learning_rate": 1.878770500845181e-08, "logits/chosen": -2.9053783416748047, "logits/rejected": -2.8845105171203613, "logps/chosen": -66.89297485351562, "logps/rejected": -71.99024200439453, "loss": 0.6711, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.14493872225284576, "rewards/margins": 0.048881541937589645, "rewards/rejected": -0.1938202679157257, "step": 3610 }, { "epoch": 2.4948311509303926, "grad_norm": 1.8169244527816772, "learning_rate": 1.8642161866969946e-08, "logits/chosen": -2.912402391433716, "logits/rejected": -2.893294095993042, "logps/chosen": -70.0347900390625, "logps/rejected": -72.41590118408203, "loss": 0.6699, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.14573247730731964, "rewards/margins": 0.05083359032869339, "rewards/rejected": -0.19656606018543243, "step": 3620 }, { "epoch": 2.501722949689869, "grad_norm": 1.8912067413330078, "learning_rate": 1.8496848746680856e-08, "logits/chosen": -2.891754627227783, "logits/rejected": -2.8633055686950684, "logps/chosen": -69.02076721191406, "logps/rejected": -72.99523162841797, "loss": 0.6679, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.14365717768669128, "rewards/margins": 0.05525166541337967, "rewards/rejected": -0.19890885055065155, "step": 3630 }, { "epoch": 2.5086147484493453, "grad_norm": 1.8045138120651245, "learning_rate": 1.8351770904890036e-08, "logits/chosen": -2.8837451934814453, "logits/rejected": -2.8583309650421143, "logps/chosen": -69.75531005859375, "logps/rejected": -73.83203125, "loss": 0.668, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.14414240419864655, "rewards/margins": 0.0553676001727581, "rewards/rejected": -0.19950999319553375, "step": 3640 }, { "epoch": 2.5155065472088216, "grad_norm": 1.7449434995651245, "learning_rate": 1.8206933590390786e-08, "logits/chosen": -2.848989963531494, "logits/rejected": -2.8349595069885254, "logps/chosen": -67.76069641113281, "logps/rejected": -71.66133117675781, "loss": 0.6748, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.15513445436954498, "rewards/margins": 0.04100924730300903, "rewards/rejected": -0.19614370167255402, "step": 3650 }, { "epoch": 2.5223983459682975, "grad_norm": 1.752121925354004, "learning_rate": 1.8062342043274324e-08, "logits/chosen": -2.8243088722229004, "logits/rejected": -2.8078298568725586, "logps/chosen": -68.06270599365234, "logps/rejected": -72.0756607055664, "loss": 0.6704, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.14946483075618744, "rewards/margins": 0.050057303160429, "rewards/rejected": -0.19952215254306793, "step": 3660 }, { "epoch": 2.529290144727774, "grad_norm": 1.8015022277832031, "learning_rate": 1.7918001494740237e-08, "logits/chosen": -2.865640640258789, "logits/rejected": -2.840625524520874, "logps/chosen": -66.65022277832031, "logps/rejected": -70.99957275390625, "loss": 0.6731, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.1517062932252884, "rewards/margins": 0.04449736699461937, "rewards/rejected": -0.19620364904403687, "step": 3670 }, { "epoch": 2.53618194348725, "grad_norm": 2.003652811050415, "learning_rate": 1.777391716690718e-08, "logits/chosen": -2.9066572189331055, "logits/rejected": -2.888805866241455, "logps/chosen": -69.33636474609375, "logps/rejected": -74.57949829101562, "loss": 0.669, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15022285282611847, "rewards/margins": 0.053607165813446045, "rewards/rejected": -0.20383000373840332, "step": 3680 }, { "epoch": 2.5430737422467264, "grad_norm": 1.800127387046814, "learning_rate": 1.7630094272623956e-08, "logits/chosen": -2.893311023712158, "logits/rejected": -2.873443603515625, "logps/chosen": -67.94241333007812, "logps/rejected": -72.42433166503906, "loss": 0.6668, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.14471067488193512, "rewards/margins": 0.05841884762048721, "rewards/rejected": -0.20312952995300293, "step": 3690 }, { "epoch": 2.5499655410062028, "grad_norm": 1.9014350175857544, "learning_rate": 1.748653801528095e-08, "logits/chosen": -2.8817379474639893, "logits/rejected": -2.869598388671875, "logps/chosen": -69.22373962402344, "logps/rejected": -73.54571533203125, "loss": 0.6733, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.15719066560268402, "rewards/margins": 0.04415198415517807, "rewards/rejected": -0.2013426572084427, "step": 3700 }, { "epoch": 2.5499655410062028, "eval_logits/chosen": -2.985821008682251, "eval_logits/rejected": -2.980015993118286, "eval_logps/chosen": -69.1901626586914, "eval_logps/rejected": -76.1957778930664, "eval_loss": 0.6819745898246765, "eval_rewards/accuracies": 0.6031598448753357, "eval_rewards/chosen": -0.10478268563747406, "eval_rewards/margins": 0.025373850017786026, "eval_rewards/rejected": -0.1301565319299698, "eval_runtime": 383.2525, "eval_samples_per_second": 11.23, "eval_steps_per_second": 1.404, "step": 3700 }, { "epoch": 2.5568573397656786, "grad_norm": 2.0130882263183594, "learning_rate": 1.734325358862181e-08, "logits/chosen": -2.948828935623169, "logits/rejected": -2.9170308113098145, "logps/chosen": -71.13240051269531, "logps/rejected": -73.09709930419922, "loss": 0.668, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.15360499918460846, "rewards/margins": 0.05556768923997879, "rewards/rejected": -0.20917268097400665, "step": 3710 }, { "epoch": 2.563749138525155, "grad_norm": 1.8439056873321533, "learning_rate": 1.7200246176555605e-08, "logits/chosen": -2.85197377204895, "logits/rejected": -2.829017162322998, "logps/chosen": -69.58597564697266, "logps/rejected": -73.71815490722656, "loss": 0.6678, "rewards/accuracies": 0.671875, "rewards/chosen": -0.14899283647537231, "rewards/margins": 0.05594003200531006, "rewards/rejected": -0.20493288338184357, "step": 3720 }, { "epoch": 2.5706409372846313, "grad_norm": 1.7613168954849243, "learning_rate": 1.7057520952969256e-08, "logits/chosen": -2.81720232963562, "logits/rejected": -2.7989039421081543, "logps/chosen": -68.91320037841797, "logps/rejected": -74.29969787597656, "loss": 0.6744, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.15541571378707886, "rewards/margins": 0.042618438601493835, "rewards/rejected": -0.1980341374874115, "step": 3730 }, { "epoch": 2.5775327360441076, "grad_norm": 1.891718864440918, "learning_rate": 1.6915083081540328e-08, "logits/chosen": -2.9272730350494385, "logits/rejected": -2.902034044265747, "logps/chosen": -68.70365905761719, "logps/rejected": -72.29713439941406, "loss": 0.6667, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.14397987723350525, "rewards/margins": 0.05881938338279724, "rewards/rejected": -0.2027992457151413, "step": 3740 }, { "epoch": 2.584424534803584, "grad_norm": 1.9311151504516602, "learning_rate": 1.6772937715550234e-08, "logits/chosen": -2.9279627799987793, "logits/rejected": -2.906362771987915, "logps/chosen": -69.35621643066406, "logps/rejected": -73.2379379272461, "loss": 0.6718, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.1548062115907669, "rewards/margins": 0.047114502638578415, "rewards/rejected": -0.20192070305347443, "step": 3750 }, { "epoch": 2.59131633356306, "grad_norm": 1.97612726688385, "learning_rate": 1.6631089997697788e-08, "logits/chosen": -2.8803021907806396, "logits/rejected": -2.866580009460449, "logps/chosen": -68.98406219482422, "logps/rejected": -73.53099060058594, "loss": 0.6736, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.15731267631053925, "rewards/margins": 0.044545404613018036, "rewards/rejected": -0.20185807347297668, "step": 3760 }, { "epoch": 2.598208132322536, "grad_norm": 1.8041105270385742, "learning_rate": 1.648954505991315e-08, "logits/chosen": -2.916429042816162, "logits/rejected": -2.8861441612243652, "logps/chosen": -69.45280456542969, "logps/rejected": -72.04377746582031, "loss": 0.6703, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.14777907729148865, "rewards/margins": 0.05071541666984558, "rewards/rejected": -0.19849450886249542, "step": 3770 }, { "epoch": 2.6050999310820124, "grad_norm": 1.9320927858352661, "learning_rate": 1.634830802317215e-08, "logits/chosen": -2.8755927085876465, "logits/rejected": -2.855790376663208, "logps/chosen": -70.11579132080078, "logps/rejected": -72.98383331298828, "loss": 0.6751, "rewards/accuracies": 0.625, "rewards/chosen": -0.15885935723781586, "rewards/margins": 0.04062425717711449, "rewards/rejected": -0.19948363304138184, "step": 3780 }, { "epoch": 2.6119917298414888, "grad_norm": 1.912875771522522, "learning_rate": 1.6207383997311025e-08, "logits/chosen": -2.889190673828125, "logits/rejected": -2.866313934326172, "logps/chosen": -69.24533081054688, "logps/rejected": -74.37500762939453, "loss": 0.6675, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15136495232582092, "rewards/margins": 0.05730954557657242, "rewards/rejected": -0.20867450535297394, "step": 3790 }, { "epoch": 2.618883528600965, "grad_norm": 1.8908205032348633, "learning_rate": 1.6066778080841532e-08, "logits/chosen": -2.873011350631714, "logits/rejected": -2.8602213859558105, "logps/chosen": -69.9706802368164, "logps/rejected": -74.43916320800781, "loss": 0.6715, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.15705353021621704, "rewards/margins": 0.048626355826854706, "rewards/rejected": -0.20567989349365234, "step": 3800 }, { "epoch": 2.618883528600965, "eval_logits/chosen": -2.98233962059021, "eval_logits/rejected": -2.9765419960021973, "eval_logps/chosen": -69.47762298583984, "eval_logps/rejected": -76.54092407226562, "eval_loss": 0.6817257404327393, "eval_rewards/accuracies": 0.6045538783073425, "eval_rewards/chosen": -0.10765726119279861, "eval_rewards/margins": 0.02595076523721218, "eval_rewards/rejected": -0.13360802829265594, "eval_runtime": 383.1892, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 3800 }, { "epoch": 2.625775327360441, "grad_norm": 1.945522427558899, "learning_rate": 1.5926495360766518e-08, "logits/chosen": -2.8317630290985107, "logits/rejected": -2.80340313911438, "logps/chosen": -71.25188446044922, "logps/rejected": -73.55364990234375, "loss": 0.6684, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.1576179713010788, "rewards/margins": 0.05499190092086792, "rewards/rejected": -0.2126098871231079, "step": 3810 }, { "epoch": 2.6326671261199173, "grad_norm": 1.8983831405639648, "learning_rate": 1.5786540912395846e-08, "logits/chosen": -2.9072282314300537, "logits/rejected": -2.880308151245117, "logps/chosen": -69.13433837890625, "logps/rejected": -73.61743927001953, "loss": 0.6669, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.15590843558311462, "rewards/margins": 0.057780999690294266, "rewards/rejected": -0.2136894166469574, "step": 3820 }, { "epoch": 2.6395589248793936, "grad_norm": 2.0345160961151123, "learning_rate": 1.564691979916278e-08, "logits/chosen": -2.86543607711792, "logits/rejected": -2.8393256664276123, "logps/chosen": -68.73051452636719, "logps/rejected": -73.54914855957031, "loss": 0.6663, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.1570970118045807, "rewards/margins": 0.060264430940151215, "rewards/rejected": -0.2173614501953125, "step": 3830 }, { "epoch": 2.64645072363887, "grad_norm": 1.901016354560852, "learning_rate": 1.5507637072440824e-08, "logits/chosen": -2.8357040882110596, "logits/rejected": -2.8217854499816895, "logps/chosen": -70.00772094726562, "logps/rejected": -75.92124938964844, "loss": 0.6703, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.15814658999443054, "rewards/margins": 0.050741057842969894, "rewards/rejected": -0.20888766646385193, "step": 3840 }, { "epoch": 2.6533425223983462, "grad_norm": 1.8903396129608154, "learning_rate": 1.5368697771360922e-08, "logits/chosen": -2.9293346405029297, "logits/rejected": -2.9006314277648926, "logps/chosen": -72.32757568359375, "logps/rejected": -74.44883728027344, "loss": 0.6683, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -0.16006004810333252, "rewards/margins": 0.055580876767635345, "rewards/rejected": -0.21564093232154846, "step": 3850 }, { "epoch": 2.660234321157822, "grad_norm": 1.9308605194091797, "learning_rate": 1.523010692262918e-08, "logits/chosen": -2.9361400604248047, "logits/rejected": -2.9061970710754395, "logps/chosen": -71.77800750732422, "logps/rejected": -74.0538330078125, "loss": 0.6685, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1560564637184143, "rewards/margins": 0.054051823914051056, "rewards/rejected": -0.21010828018188477, "step": 3860 }, { "epoch": 2.6671261199172984, "grad_norm": 1.8674836158752441, "learning_rate": 1.5091869540345003e-08, "logits/chosen": -2.870302200317383, "logits/rejected": -2.848585367202759, "logps/chosen": -72.25010681152344, "logps/rejected": -74.80943298339844, "loss": 0.6706, "rewards/accuracies": 0.625, "rewards/chosen": -0.15403147041797638, "rewards/margins": 0.05039002746343613, "rewards/rejected": -0.20442147552967072, "step": 3870 }, { "epoch": 2.6740179186767747, "grad_norm": 1.8031920194625854, "learning_rate": 1.495399062581966e-08, "logits/chosen": -2.835313320159912, "logits/rejected": -2.8142800331115723, "logps/chosen": -71.18692016601562, "logps/rejected": -73.02665710449219, "loss": 0.6715, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.15897247195243835, "rewards/margins": 0.048299357295036316, "rewards/rejected": -0.20727181434631348, "step": 3880 }, { "epoch": 2.6809097174362506, "grad_norm": 1.9062350988388062, "learning_rate": 1.481647516739537e-08, "logits/chosen": -2.867675304412842, "logits/rejected": -2.8461194038391113, "logps/chosen": -69.1014404296875, "logps/rejected": -74.23119354248047, "loss": 0.6687, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15956063568592072, "rewards/margins": 0.05391395092010498, "rewards/rejected": -0.2134745866060257, "step": 3890 }, { "epoch": 2.687801516195727, "grad_norm": 1.752568244934082, "learning_rate": 1.4679328140264815e-08, "logits/chosen": -2.892151355743408, "logits/rejected": -2.8774960041046143, "logps/chosen": -69.58900451660156, "logps/rejected": -74.52362060546875, "loss": 0.6709, "rewards/accuracies": 0.640625, "rewards/chosen": -0.1596180498600006, "rewards/margins": 0.0496763214468956, "rewards/rejected": -0.2092943638563156, "step": 3900 }, { "epoch": 2.687801516195727, "eval_logits/chosen": -2.9786884784698486, "eval_logits/rejected": -2.972900390625, "eval_logps/chosen": -69.7330322265625, "eval_logps/rejected": -76.83737182617188, "eval_loss": 0.6815603971481323, "eval_rewards/accuracies": 0.6019981503486633, "eval_rewards/chosen": -0.11021139472723007, "eval_rewards/margins": 0.026361122727394104, "eval_rewards/rejected": -0.13657251000404358, "eval_runtime": 383.1892, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 3900 }, { "epoch": 2.6946933149552033, "grad_norm": 1.9829176664352417, "learning_rate": 1.4542554506291169e-08, "logits/chosen": -2.8860998153686523, "logits/rejected": -2.866718292236328, "logps/chosen": -69.78288269042969, "logps/rejected": -74.26182556152344, "loss": 0.6678, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.15368084609508514, "rewards/margins": 0.05583782121539116, "rewards/rejected": -0.2095186710357666, "step": 3910 }, { "epoch": 2.7015851137146796, "grad_norm": 1.8440921306610107, "learning_rate": 1.4406159213828506e-08, "logits/chosen": -2.8718748092651367, "logits/rejected": -2.852900505065918, "logps/chosen": -69.7797622680664, "logps/rejected": -75.06401824951172, "loss": 0.6663, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.1587846577167511, "rewards/margins": 0.058991938829422, "rewards/rejected": -0.2177765816450119, "step": 3920 }, { "epoch": 2.708476912474156, "grad_norm": 1.9059720039367676, "learning_rate": 1.427014719754287e-08, "logits/chosen": -2.8898565769195557, "logits/rejected": -2.8734545707702637, "logps/chosen": -70.47734069824219, "logps/rejected": -75.37660217285156, "loss": 0.6718, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.16778220236301422, "rewards/margins": 0.047993969172239304, "rewards/rejected": -0.21577616035938263, "step": 3930 }, { "epoch": 2.7153687112336318, "grad_norm": 1.9197648763656616, "learning_rate": 1.4134523378233698e-08, "logits/chosen": -2.8931357860565186, "logits/rejected": -2.8730826377868652, "logps/chosen": -68.95951843261719, "logps/rejected": -73.48743438720703, "loss": 0.6729, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.16806289553642273, "rewards/margins": 0.04564516991376877, "rewards/rejected": -0.2137080729007721, "step": 3940 }, { "epoch": 2.722260509993108, "grad_norm": 1.8608909845352173, "learning_rate": 1.3999292662655754e-08, "logits/chosen": -2.9023194313049316, "logits/rejected": -2.877852439880371, "logps/chosen": -70.27208709716797, "logps/rejected": -72.99089813232422, "loss": 0.6702, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15660075843334198, "rewards/margins": 0.050805769860744476, "rewards/rejected": -0.20740655064582825, "step": 3950 }, { "epoch": 2.7291523087525844, "grad_norm": 1.8643169403076172, "learning_rate": 1.3864459943341675e-08, "logits/chosen": -2.901879072189331, "logits/rejected": -2.891754627227783, "logps/chosen": -68.13871765136719, "logps/rejected": -74.3793716430664, "loss": 0.6737, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.17261925339698792, "rewards/margins": 0.044561196118593216, "rewards/rejected": -0.21718044579029083, "step": 3960 }, { "epoch": 2.7360441075120607, "grad_norm": 1.9297481775283813, "learning_rate": 1.3730030098424927e-08, "logits/chosen": -2.88895583152771, "logits/rejected": -2.864968776702881, "logps/chosen": -69.42606353759766, "logps/rejected": -73.60870361328125, "loss": 0.669, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.16050942242145538, "rewards/margins": 0.05298808962106705, "rewards/rejected": -0.21349748969078064, "step": 3970 }, { "epoch": 2.742935906271537, "grad_norm": 1.9109336137771606, "learning_rate": 1.3596007991463298e-08, "logits/chosen": -2.861509323120117, "logits/rejected": -2.832160472869873, "logps/chosen": -70.3248519897461, "logps/rejected": -72.97554016113281, "loss": 0.6654, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15822330117225647, "rewards/margins": 0.06100432202219963, "rewards/rejected": -0.2192276418209076, "step": 3980 }, { "epoch": 2.749827705031013, "grad_norm": 1.7587929964065552, "learning_rate": 1.3462398471262992e-08, "logits/chosen": -2.870997905731201, "logits/rejected": -2.8450169563293457, "logps/chosen": -72.17587280273438, "logps/rejected": -74.4552993774414, "loss": 0.667, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.1500275880098343, "rewards/margins": 0.05822940915822983, "rewards/rejected": -0.20825700461864471, "step": 3990 }, { "epoch": 2.7567195037904892, "grad_norm": 1.9692848920822144, "learning_rate": 1.3329206371703166e-08, "logits/chosen": -2.8925511837005615, "logits/rejected": -2.8751959800720215, "logps/chosen": -71.35633850097656, "logps/rejected": -76.25764465332031, "loss": 0.6696, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.16284766793251038, "rewards/margins": 0.05212429165840149, "rewards/rejected": -0.21497198939323425, "step": 4000 }, { "epoch": 2.7567195037904892, "eval_logits/chosen": -2.975595235824585, "eval_logits/rejected": -2.9697747230529785, "eval_logps/chosen": -70.03456115722656, "eval_logps/rejected": -77.1830825805664, "eval_loss": 0.6813851594924927, "eval_rewards/accuracies": 0.6031598448753357, "eval_rewards/chosen": -0.11322659999132156, "eval_rewards/margins": 0.026802998036146164, "eval_rewards/rejected": -0.14002960920333862, "eval_runtime": 383.1778, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 4000 }, { "epoch": 2.7636113025499656, "grad_norm": 1.9502569437026978, "learning_rate": 1.3196436511561027e-08, "logits/chosen": -2.9590206146240234, "logits/rejected": -2.9436488151550293, "logps/chosen": -69.94823455810547, "logps/rejected": -74.14200592041016, "loss": 0.6688, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16159674525260925, "rewards/margins": 0.05501946061849594, "rewards/rejected": -0.2166161984205246, "step": 4010 }, { "epoch": 2.770503101309442, "grad_norm": 1.898844838142395, "learning_rate": 1.3064093694337552e-08, "logits/chosen": -2.8715386390686035, "logits/rejected": -2.845707654953003, "logps/chosen": -71.83735656738281, "logps/rejected": -75.913818359375, "loss": 0.6637, "rewards/accuracies": 0.6890624761581421, "rewards/chosen": -0.1576516330242157, "rewards/margins": 0.06487510353326797, "rewards/rejected": -0.22252674400806427, "step": 4020 }, { "epoch": 2.777394900068918, "grad_norm": 1.9604547023773193, "learning_rate": 1.2932182708083659e-08, "logits/chosen": -2.8502888679504395, "logits/rejected": -2.8310112953186035, "logps/chosen": -69.13433074951172, "logps/rejected": -74.49159240722656, "loss": 0.6682, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.15932469069957733, "rewards/margins": 0.05556952953338623, "rewards/rejected": -0.21489422023296356, "step": 4030 }, { "epoch": 2.784286698828394, "grad_norm": 1.8763211965560913, "learning_rate": 1.2800708325226967e-08, "logits/chosen": -2.9329864978790283, "logits/rejected": -2.9085593223571777, "logps/chosen": -69.78377532958984, "logps/rejected": -74.4681625366211, "loss": 0.6681, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.16460582613945007, "rewards/margins": 0.055522024631500244, "rewards/rejected": -0.22012785077095032, "step": 4040 }, { "epoch": 2.7911784975878704, "grad_norm": 1.9302481412887573, "learning_rate": 1.2669675302399174e-08, "logits/chosen": -2.8411152362823486, "logits/rejected": -2.805752992630005, "logps/chosen": -71.40072631835938, "logps/rejected": -75.06214904785156, "loss": 0.6654, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -0.16166746616363525, "rewards/margins": 0.061172910034656525, "rewards/rejected": -0.22284038364887238, "step": 4050 }, { "epoch": 2.7980702963473467, "grad_norm": 1.9212311506271362, "learning_rate": 1.2539088380263958e-08, "logits/chosen": -2.8404088020324707, "logits/rejected": -2.8218789100646973, "logps/chosen": -68.89048767089844, "logps/rejected": -74.90914154052734, "loss": 0.6682, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.1601947844028473, "rewards/margins": 0.05537404865026474, "rewards/rejected": -0.21556885540485382, "step": 4060 }, { "epoch": 2.804962095106823, "grad_norm": 2.0027828216552734, "learning_rate": 1.240895228334542e-08, "logits/chosen": -2.920994281768799, "logits/rejected": -2.898411750793457, "logps/chosen": -71.86430358886719, "logps/rejected": -76.43321228027344, "loss": 0.6673, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.16580648720264435, "rewards/margins": 0.05794544145464897, "rewards/rejected": -0.22375193238258362, "step": 4070 }, { "epoch": 2.8118538938662994, "grad_norm": 1.9354565143585205, "learning_rate": 1.2279271719857196e-08, "logits/chosen": -2.897157669067383, "logits/rejected": -2.8823513984680176, "logps/chosen": -69.52801513671875, "logps/rejected": -74.7330322265625, "loss": 0.6728, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.16293780505657196, "rewards/margins": 0.045937877148389816, "rewards/rejected": -0.20887568593025208, "step": 4080 }, { "epoch": 2.8187456926257752, "grad_norm": 1.8890588283538818, "learning_rate": 1.2150051381532137e-08, "logits/chosen": -2.8874728679656982, "logits/rejected": -2.8626890182495117, "logps/chosen": -70.39543151855469, "logps/rejected": -74.44825744628906, "loss": 0.6658, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.15262214839458466, "rewards/margins": 0.060657352209091187, "rewards/rejected": -0.21327951550483704, "step": 4090 }, { "epoch": 2.8256374913852516, "grad_norm": 1.9345576763153076, "learning_rate": 1.2021295943452495e-08, "logits/chosen": -2.8419175148010254, "logits/rejected": -2.82415509223938, "logps/chosen": -69.63661193847656, "logps/rejected": -74.35877990722656, "loss": 0.6687, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.15847401320934296, "rewards/margins": 0.05471924692392349, "rewards/rejected": -0.21319326758384705, "step": 4100 }, { "epoch": 2.8256374913852516, "eval_logits/chosen": -2.9728803634643555, "eval_logits/rejected": -2.967038631439209, "eval_logps/chosen": -70.25259399414062, "eval_logps/rejected": -77.45008087158203, "eval_loss": 0.681178092956543, "eval_rewards/accuracies": 0.604786217212677, "eval_rewards/chosen": -0.11540694534778595, "eval_rewards/margins": 0.02729259990155697, "eval_rewards/rejected": -0.14269953966140747, "eval_runtime": 383.0187, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 4100 }, { "epoch": 2.832529290144728, "grad_norm": 1.9737968444824219, "learning_rate": 1.1893010063880853e-08, "logits/chosen": -2.8246657848358154, "logits/rejected": -2.8063836097717285, "logps/chosen": -70.7244644165039, "logps/rejected": -74.58245849609375, "loss": 0.6722, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.1705399453639984, "rewards/margins": 0.04741028696298599, "rewards/rejected": -0.2179502248764038, "step": 4110 }, { "epoch": 2.8394210889042037, "grad_norm": 1.899269461631775, "learning_rate": 1.1765198384091577e-08, "logits/chosen": -2.8298823833465576, "logits/rejected": -2.8139114379882812, "logps/chosen": -68.4229507446289, "logps/rejected": -75.86543273925781, "loss": 0.6629, "rewards/accuracies": 0.703125, "rewards/chosen": -0.1606307029724121, "rewards/margins": 0.06716320663690567, "rewards/rejected": -0.22779390215873718, "step": 4120 }, { "epoch": 2.84631288766368, "grad_norm": 1.9691749811172485, "learning_rate": 1.1637865528202845e-08, "logits/chosen": -2.9220492839813232, "logits/rejected": -2.89398193359375, "logps/chosen": -72.20655822753906, "logps/rejected": -74.08243560791016, "loss": 0.6705, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1643935590982437, "rewards/margins": 0.051238853484392166, "rewards/rejected": -0.21563240885734558, "step": 4130 }, { "epoch": 2.8532046864231564, "grad_norm": 1.8378872871398926, "learning_rate": 1.1511016103009425e-08, "logits/chosen": -2.8455426692962646, "logits/rejected": -2.824030637741089, "logps/chosen": -71.28382873535156, "logps/rejected": -75.72223663330078, "loss": 0.6695, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1700962334871292, "rewards/margins": 0.05256285145878792, "rewards/rejected": -0.22265906631946564, "step": 4140 }, { "epoch": 2.8600964851826327, "grad_norm": 1.85801100730896, "learning_rate": 1.1384654697815973e-08, "logits/chosen": -2.8683829307556152, "logits/rejected": -2.8490817546844482, "logps/chosen": -70.61616516113281, "logps/rejected": -74.1768569946289, "loss": 0.6721, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.1679346263408661, "rewards/margins": 0.04759329929947853, "rewards/rejected": -0.21552792191505432, "step": 4150 }, { "epoch": 2.866988283942109, "grad_norm": 1.881890892982483, "learning_rate": 1.1258785884270972e-08, "logits/chosen": -2.829272985458374, "logits/rejected": -2.7988953590393066, "logps/chosen": -73.9723129272461, "logps/rejected": -75.06248474121094, "loss": 0.6668, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.16338932514190674, "rewards/margins": 0.058902084827423096, "rewards/rejected": -0.22229138016700745, "step": 4160 }, { "epoch": 2.873880082701585, "grad_norm": 1.86675226688385, "learning_rate": 1.1133414216201372e-08, "logits/chosen": -2.824122905731201, "logits/rejected": -2.8125271797180176, "logps/chosen": -68.92567443847656, "logps/rejected": -73.56863403320312, "loss": 0.6727, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.16737547516822815, "rewards/margins": 0.046836256980895996, "rewards/rejected": -0.21421174705028534, "step": 4170 }, { "epoch": 2.8807718814610612, "grad_norm": 1.7521086931228638, "learning_rate": 1.1008544229447836e-08, "logits/chosen": -2.9104981422424316, "logits/rejected": -2.885585069656372, "logps/chosen": -69.56279754638672, "logps/rejected": -73.14405822753906, "loss": 0.6702, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.1627635508775711, "rewards/margins": 0.05127249285578728, "rewards/rejected": -0.2140360325574875, "step": 4180 }, { "epoch": 2.8876636802205375, "grad_norm": 1.9362108707427979, "learning_rate": 1.0884180441700588e-08, "logits/chosen": -2.8215527534484863, "logits/rejected": -2.8023669719696045, "logps/chosen": -71.02534484863281, "logps/rejected": -76.00444030761719, "loss": 0.6701, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.16891708970069885, "rewards/margins": 0.051739465445280075, "rewards/rejected": -0.22065654397010803, "step": 4190 }, { "epoch": 2.894555478980014, "grad_norm": 2.1264278888702393, "learning_rate": 1.0760327352336024e-08, "logits/chosen": -2.8850719928741455, "logits/rejected": -2.8598134517669678, "logps/chosen": -70.82664489746094, "logps/rejected": -74.36808776855469, "loss": 0.6692, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16466885805130005, "rewards/margins": 0.05422654002904892, "rewards/rejected": -0.21889539062976837, "step": 4200 }, { "epoch": 2.894555478980014, "eval_logits/chosen": -2.9707539081573486, "eval_logits/rejected": -2.964906930923462, "eval_logps/chosen": -70.37152862548828, "eval_logps/rejected": -77.60812377929688, "eval_loss": 0.681008517742157, "eval_rewards/accuracies": 0.607342004776001, "eval_rewards/chosen": -0.11659645289182663, "eval_rewards/margins": 0.02768353372812271, "eval_rewards/rejected": -0.14427998661994934, "eval_runtime": 383.2413, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 4200 }, { "epoch": 2.90144727773949, "grad_norm": 1.9289238452911377, "learning_rate": 1.0636989442253914e-08, "logits/chosen": -2.8836934566497803, "logits/rejected": -2.858093738555908, "logps/chosen": -70.48356628417969, "logps/rejected": -74.24246215820312, "loss": 0.6681, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.16563943028450012, "rewards/margins": 0.056425075978040695, "rewards/rejected": -0.22206449508666992, "step": 4210 }, { "epoch": 2.908339076498966, "grad_norm": 2.0953094959259033, "learning_rate": 1.0514171173715245e-08, "logits/chosen": -2.918147087097168, "logits/rejected": -2.888167142868042, "logps/chosen": -70.07975006103516, "logps/rejected": -72.76868438720703, "loss": 0.6707, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.1693296879529953, "rewards/margins": 0.05121892690658569, "rewards/rejected": -0.2205486297607422, "step": 4220 }, { "epoch": 2.9152308752584424, "grad_norm": 1.9358092546463013, "learning_rate": 1.039187699018085e-08, "logits/chosen": -2.8874099254608154, "logits/rejected": -2.8688271045684814, "logps/chosen": -70.86736297607422, "logps/rejected": -74.66913604736328, "loss": 0.6734, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1724914014339447, "rewards/margins": 0.044794049113988876, "rewards/rejected": -0.21728546917438507, "step": 4230 }, { "epoch": 2.9221226740179187, "grad_norm": 2.0595898628234863, "learning_rate": 1.0270111316150585e-08, "logits/chosen": -2.8881657123565674, "logits/rejected": -2.871654510498047, "logps/chosen": -70.68883514404297, "logps/rejected": -75.88733673095703, "loss": 0.6669, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15845754742622375, "rewards/margins": 0.0580957755446434, "rewards/rejected": -0.21655333042144775, "step": 4240 }, { "epoch": 2.929014472777395, "grad_norm": 1.858399748802185, "learning_rate": 1.0148878557003299e-08, "logits/chosen": -2.9464240074157715, "logits/rejected": -2.9220833778381348, "logps/chosen": -70.81522369384766, "logps/rejected": -75.079833984375, "loss": 0.6669, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16308936476707458, "rewards/margins": 0.05801185220479965, "rewards/rejected": -0.22110123932361603, "step": 4250 }, { "epoch": 2.9359062715368713, "grad_norm": 2.005319595336914, "learning_rate": 1.0028183098837409e-08, "logits/chosen": -2.8827686309814453, "logits/rejected": -2.867227554321289, "logps/chosen": -72.57582092285156, "logps/rejected": -75.32026672363281, "loss": 0.6746, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.17912040650844574, "rewards/margins": 0.04286336153745651, "rewards/rejected": -0.22198376059532166, "step": 4260 }, { "epoch": 2.942798070296347, "grad_norm": 1.9747921228408813, "learning_rate": 9.908029308312266e-09, "logits/chosen": -2.8668737411499023, "logits/rejected": -2.8398964405059814, "logps/chosen": -71.49706268310547, "logps/rejected": -75.99536895751953, "loss": 0.6683, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.1682967096567154, "rewards/margins": 0.055577345192432404, "rewards/rejected": -0.2238740473985672, "step": 4270 }, { "epoch": 2.9496898690558235, "grad_norm": 1.840579628944397, "learning_rate": 9.788421532490134e-09, "logits/chosen": -2.86275053024292, "logits/rejected": -2.841568946838379, "logps/chosen": -71.6749496459961, "logps/rejected": -75.15389251708984, "loss": 0.6692, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.17632083594799042, "rewards/margins": 0.05344356968998909, "rewards/rejected": -0.2297644317150116, "step": 4280 }, { "epoch": 2.9565816678153, "grad_norm": 1.8896485567092896, "learning_rate": 9.669364098678912e-09, "logits/chosen": -2.8953022956848145, "logits/rejected": -2.871859550476074, "logps/chosen": -69.26052856445312, "logps/rejected": -75.07076263427734, "loss": 0.6637, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.16137142479419708, "rewards/margins": 0.0651010274887085, "rewards/rejected": -0.2264724224805832, "step": 4290 }, { "epoch": 2.963473466574776, "grad_norm": 2.062659978866577, "learning_rate": 9.550861314275613e-09, "logits/chosen": -2.914487361907959, "logits/rejected": -2.9010860919952393, "logps/chosen": -71.20018768310547, "logps/rejected": -75.36875915527344, "loss": 0.6742, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.17198467254638672, "rewards/margins": 0.043615758419036865, "rewards/rejected": -0.21560044586658478, "step": 4300 }, { "epoch": 2.963473466574776, "eval_logits/chosen": -2.9686903953552246, "eval_logits/rejected": -2.962855339050293, "eval_logps/chosen": -70.55128479003906, "eval_logps/rejected": -77.80999755859375, "eval_loss": 0.6809155344963074, "eval_rewards/accuracies": 0.6026951670646667, "eval_rewards/chosen": -0.1183938980102539, "eval_rewards/margins": 0.027904745191335678, "eval_rewards/rejected": -0.1462986320257187, "eval_runtime": 383.0559, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.404, "step": 4300 }, { "epoch": 2.9703652653342525, "grad_norm": 1.9695403575897217, "learning_rate": 9.432917466610505e-09, "logits/chosen": -2.8627686500549316, "logits/rejected": -2.860344409942627, "logps/chosen": -70.58271789550781, "logps/rejected": -77.49148559570312, "loss": 0.6707, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.17099115252494812, "rewards/margins": 0.051348716020584106, "rewards/rejected": -0.22233989834785461, "step": 4310 }, { "epoch": 2.9772570640937284, "grad_norm": 2.013923168182373, "learning_rate": 9.315536822791976e-09, "logits/chosen": -2.8354594707489014, "logits/rejected": -2.8117318153381348, "logps/chosen": -71.29680633544922, "logps/rejected": -75.37150573730469, "loss": 0.6675, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.16862986981868744, "rewards/margins": 0.056559968739748, "rewards/rejected": -0.22518984973430634, "step": 4320 }, { "epoch": 2.9841488628532047, "grad_norm": 1.9870107173919678, "learning_rate": 9.198723629552205e-09, "logits/chosen": -2.8919081687927246, "logits/rejected": -2.8731167316436768, "logps/chosen": -73.26725769042969, "logps/rejected": -75.7055892944336, "loss": 0.6717, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.17416629195213318, "rewards/margins": 0.04818163812160492, "rewards/rejected": -0.2223479300737381, "step": 4330 }, { "epoch": 2.991040661612681, "grad_norm": 1.8119279146194458, "learning_rate": 9.08248211309346e-09, "logits/chosen": -2.9039347171783447, "logits/rejected": -2.8826346397399902, "logps/chosen": -70.63749694824219, "logps/rejected": -75.0828628540039, "loss": 0.6673, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -0.16708947718143463, "rewards/margins": 0.05737733095884323, "rewards/rejected": -0.22446680068969727, "step": 4340 }, { "epoch": 2.997932460372157, "grad_norm": 1.9515116214752197, "learning_rate": 8.966816478935255e-09, "logits/chosen": -2.879502773284912, "logits/rejected": -2.8620200157165527, "logps/chosen": -70.68488311767578, "logps/rejected": -75.58467864990234, "loss": 0.6697, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.1710232049226761, "rewards/margins": 0.05235626548528671, "rewards/rejected": -0.2233794629573822, "step": 4350 }, { "epoch": 3.004824259131633, "grad_norm": 1.7856117486953735, "learning_rate": 8.851730911762168e-09, "logits/chosen": -2.869312286376953, "logits/rejected": -2.8474583625793457, "logps/chosen": -70.70429992675781, "logps/rejected": -75.0107650756836, "loss": 0.6681, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1649990677833557, "rewards/margins": 0.05564570426940918, "rewards/rejected": -0.2206447571516037, "step": 4360 }, { "epoch": 3.0117160578911095, "grad_norm": 2.004338264465332, "learning_rate": 8.73722957527242e-09, "logits/chosen": -2.858567714691162, "logits/rejected": -2.8346304893493652, "logps/chosen": -71.86139678955078, "logps/rejected": -76.08316040039062, "loss": 0.6719, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.1683572232723236, "rewards/margins": 0.04870215058326721, "rewards/rejected": -0.21705937385559082, "step": 4370 }, { "epoch": 3.018607856650586, "grad_norm": 3.9235353469848633, "learning_rate": 8.623316612027284e-09, "logits/chosen": -2.880403995513916, "logits/rejected": -2.854773759841919, "logps/chosen": -69.40467834472656, "logps/rejected": -74.3254623413086, "loss": 0.6672, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.16873586177825928, "rewards/margins": 0.058157872408628464, "rewards/rejected": -0.22689373791217804, "step": 4380 }, { "epoch": 3.025499655410062, "grad_norm": 1.9829630851745605, "learning_rate": 8.509996143301196e-09, "logits/chosen": -2.8708505630493164, "logits/rejected": -2.851668357849121, "logps/chosen": -72.15980529785156, "logps/rejected": -74.9146728515625, "loss": 0.6697, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.17295928299427032, "rewards/margins": 0.05339939519762993, "rewards/rejected": -0.22635868191719055, "step": 4390 }, { "epoch": 3.032391454169538, "grad_norm": 1.994016408920288, "learning_rate": 8.397272268932618e-09, "logits/chosen": -2.8321261405944824, "logits/rejected": -2.807309150695801, "logps/chosen": -71.56242370605469, "logps/rejected": -76.18682098388672, "loss": 0.6652, "rewards/accuracies": 0.671875, "rewards/chosen": -0.16867855191230774, "rewards/margins": 0.061644814908504486, "rewards/rejected": -0.23032338917255402, "step": 4400 }, { "epoch": 3.032391454169538, "eval_logits/chosen": -2.966364860534668, "eval_logits/rejected": -2.9605610370635986, "eval_logps/chosen": -70.62178039550781, "eval_logps/rejected": -77.91410064697266, "eval_loss": 0.6807657480239868, "eval_rewards/accuracies": 0.6089683771133423, "eval_rewards/chosen": -0.1190989762544632, "eval_rewards/margins": 0.028240786865353584, "eval_rewards/rejected": -0.14733976125717163, "eval_runtime": 383.7643, "eval_samples_per_second": 11.215, "eval_steps_per_second": 1.402, "step": 4400 }, { "epoch": 3.0392832529290144, "grad_norm": 2.224198818206787, "learning_rate": 8.285149067175734e-09, "logits/chosen": -2.8952767848968506, "logits/rejected": -2.872647762298584, "logps/chosen": -71.42662048339844, "logps/rejected": -75.63797760009766, "loss": 0.6684, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.1663835644721985, "rewards/margins": 0.05600534752011299, "rewards/rejected": -0.22238890826702118, "step": 4410 }, { "epoch": 3.0461750516884907, "grad_norm": 1.964881420135498, "learning_rate": 8.173630594552924e-09, "logits/chosen": -2.8521366119384766, "logits/rejected": -2.823565721511841, "logps/chosen": -71.7008056640625, "logps/rejected": -74.2966537475586, "loss": 0.6646, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.16019544005393982, "rewards/margins": 0.06328865140676498, "rewards/rejected": -0.2234840840101242, "step": 4420 }, { "epoch": 3.053066850447967, "grad_norm": 2.0228092670440674, "learning_rate": 8.062720885707983e-09, "logits/chosen": -2.8849539756774902, "logits/rejected": -2.8587076663970947, "logps/chosen": -69.99919128417969, "logps/rejected": -74.09209442138672, "loss": 0.6692, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.17324639856815338, "rewards/margins": 0.05324423313140869, "rewards/rejected": -0.22649061679840088, "step": 4430 }, { "epoch": 3.0599586492074433, "grad_norm": 2.038457155227661, "learning_rate": 7.95242395326011e-09, "logits/chosen": -2.8559212684631348, "logits/rejected": -2.8325462341308594, "logps/chosen": -71.53230285644531, "logps/rejected": -75.818603515625, "loss": 0.6647, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.1681286245584488, "rewards/margins": 0.06264480948448181, "rewards/rejected": -0.2307734191417694, "step": 4440 }, { "epoch": 3.066850447966919, "grad_norm": 1.6918925046920776, "learning_rate": 7.842743787658812e-09, "logits/chosen": -2.8379852771759033, "logits/rejected": -2.8106400966644287, "logps/chosen": -70.26661682128906, "logps/rejected": -74.20497131347656, "loss": 0.6676, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16945037245750427, "rewards/margins": 0.05712301656603813, "rewards/rejected": -0.2265733927488327, "step": 4450 }, { "epoch": 3.0737422467263955, "grad_norm": 1.9788740873336792, "learning_rate": 7.733684357039492e-09, "logits/chosen": -2.835134506225586, "logits/rejected": -2.8150548934936523, "logps/chosen": -68.41593933105469, "logps/rejected": -74.57231903076172, "loss": 0.6661, "rewards/accuracies": 0.682812511920929, "rewards/chosen": -0.16778665781021118, "rewards/margins": 0.059963636100292206, "rewards/rejected": -0.22775030136108398, "step": 4460 }, { "epoch": 3.080634045485872, "grad_norm": 1.8905788660049438, "learning_rate": 7.62524960707986e-09, "logits/chosen": -2.8406686782836914, "logits/rejected": -2.8143975734710693, "logps/chosen": -71.17180633544922, "logps/rejected": -75.61317443847656, "loss": 0.6654, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.1681421846151352, "rewards/margins": 0.06106606125831604, "rewards/rejected": -0.22920826077461243, "step": 4470 }, { "epoch": 3.087525844245348, "grad_norm": 1.8831181526184082, "learning_rate": 7.517443460857229e-09, "logits/chosen": -2.8299612998962402, "logits/rejected": -2.8093419075012207, "logps/chosen": -71.86965942382812, "logps/rejected": -78.294189453125, "loss": 0.6669, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17062735557556152, "rewards/margins": 0.05883735418319702, "rewards/rejected": -0.22946472465991974, "step": 4480 }, { "epoch": 3.0944176430048245, "grad_norm": 1.9670896530151367, "learning_rate": 7.410269818706574e-09, "logits/chosen": -2.829634428024292, "logits/rejected": -2.8150954246520996, "logps/chosen": -70.93474578857422, "logps/rejected": -76.74848175048828, "loss": 0.6685, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.1692148596048355, "rewards/margins": 0.05438335984945297, "rewards/rejected": -0.22359821200370789, "step": 4490 }, { "epoch": 3.1013094417643003, "grad_norm": 1.9488273859024048, "learning_rate": 7.303732558079379e-09, "logits/chosen": -2.8560550212860107, "logits/rejected": -2.83811616897583, "logps/chosen": -70.2449951171875, "logps/rejected": -75.04199981689453, "loss": 0.6659, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.16745367646217346, "rewards/margins": 0.05973466485738754, "rewards/rejected": -0.2271883189678192, "step": 4500 }, { "epoch": 3.1013094417643003, "eval_logits/chosen": -2.9645352363586426, "eval_logits/rejected": -2.958704710006714, "eval_logps/chosen": -70.77420043945312, "eval_logps/rejected": -78.07850646972656, "eval_loss": 0.6807343363761902, "eval_rewards/accuracies": 0.6045538783073425, "eval_rewards/chosen": -0.12062300741672516, "eval_rewards/margins": 0.028360825031995773, "eval_rewards/rejected": -0.14898382127285004, "eval_runtime": 384.7109, "eval_samples_per_second": 11.188, "eval_steps_per_second": 1.398, "step": 4500 }, { "epoch": 3.1082012405237767, "grad_norm": 2.031484365463257, "learning_rate": 7.197835533403404e-09, "logits/chosen": -2.9088339805603027, "logits/rejected": -2.889122724533081, "logps/chosen": -70.42329406738281, "logps/rejected": -74.39561462402344, "loss": 0.6699, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.16674701869487762, "rewards/margins": 0.051632173359394073, "rewards/rejected": -0.2183792144060135, "step": 4510 }, { "epoch": 3.115093039283253, "grad_norm": 1.962405800819397, "learning_rate": 7.092582575943218e-09, "logits/chosen": -2.785108804702759, "logits/rejected": -2.767988443374634, "logps/chosen": -72.75899505615234, "logps/rejected": -77.11186981201172, "loss": 0.6705, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.17647066712379456, "rewards/margins": 0.05130830407142639, "rewards/rejected": -0.22777898609638214, "step": 4520 }, { "epoch": 3.1219848380427293, "grad_norm": 1.9696760177612305, "learning_rate": 6.9879774936615645e-09, "logits/chosen": -2.8616669178009033, "logits/rejected": -2.839104175567627, "logps/chosen": -70.83036041259766, "logps/rejected": -74.57025909423828, "loss": 0.667, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.16625390946865082, "rewards/margins": 0.05807176232337952, "rewards/rejected": -0.22432568669319153, "step": 4530 }, { "epoch": 3.128876636802205, "grad_norm": 2.048768997192383, "learning_rate": 6.884024071081632e-09, "logits/chosen": -2.8838143348693848, "logits/rejected": -2.8573451042175293, "logps/chosen": -72.37026977539062, "logps/rejected": -75.74974060058594, "loss": 0.67, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.1692103147506714, "rewards/margins": 0.05285166576504707, "rewards/rejected": -0.22206199169158936, "step": 4540 }, { "epoch": 3.1357684355616815, "grad_norm": 2.06854248046875, "learning_rate": 6.7807260691501196e-09, "logits/chosen": -2.806816577911377, "logits/rejected": -2.786256790161133, "logps/chosen": -71.26956939697266, "logps/rejected": -76.16363525390625, "loss": 0.6672, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.1713106483221054, "rewards/margins": 0.05790756270289421, "rewards/rejected": -0.22921821475028992, "step": 4550 }, { "epoch": 3.142660234321158, "grad_norm": 1.9100197553634644, "learning_rate": 6.67808722510112e-09, "logits/chosen": -2.8889167308807373, "logits/rejected": -2.867492914199829, "logps/chosen": -70.50053405761719, "logps/rejected": -75.88932037353516, "loss": 0.6697, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.17244485020637512, "rewards/margins": 0.05211521312594414, "rewards/rejected": -0.22456006705760956, "step": 4560 }, { "epoch": 3.149552033080634, "grad_norm": 1.968579649925232, "learning_rate": 6.576111252321001e-09, "logits/chosen": -2.8007147312164307, "logits/rejected": -2.7867965698242188, "logps/chosen": -71.63652038574219, "logps/rejected": -76.29466247558594, "loss": 0.6685, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.17593082785606384, "rewards/margins": 0.056103091686964035, "rewards/rejected": -0.23203392326831818, "step": 4570 }, { "epoch": 3.1564438318401105, "grad_norm": 2.1014175415039062, "learning_rate": 6.474801840213995e-09, "logits/chosen": -2.862433910369873, "logits/rejected": -2.843538761138916, "logps/chosen": -71.13722229003906, "logps/rejected": -78.19841003417969, "loss": 0.6661, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.17347559332847595, "rewards/margins": 0.060491181910037994, "rewards/rejected": -0.23396675288677216, "step": 4580 }, { "epoch": 3.1633356305995863, "grad_norm": 1.9174317121505737, "learning_rate": 6.3741626540687156e-09, "logits/chosen": -2.921653985977173, "logits/rejected": -2.9019834995269775, "logps/chosen": -71.20884704589844, "logps/rejected": -77.82212829589844, "loss": 0.6668, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.17796102166175842, "rewards/margins": 0.05937998369336128, "rewards/rejected": -0.2373410165309906, "step": 4590 }, { "epoch": 3.1702274293590627, "grad_norm": 1.9432258605957031, "learning_rate": 6.274197334925596e-09, "logits/chosen": -2.7800862789154053, "logits/rejected": -2.7622694969177246, "logps/chosen": -66.99742889404297, "logps/rejected": -74.23822784423828, "loss": 0.666, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.15835708379745483, "rewards/margins": 0.060429029166698456, "rewards/rejected": -0.2187860906124115, "step": 4600 }, { "epoch": 3.1702274293590627, "eval_logits/chosen": -2.962764024734497, "eval_logits/rejected": -2.956948757171631, "eval_logps/chosen": -70.95821380615234, "eval_logps/rejected": -78.30269622802734, "eval_loss": 0.680549681186676, "eval_rewards/accuracies": 0.6061803102493286, "eval_rewards/chosen": -0.12246322631835938, "eval_rewards/margins": 0.028762485831975937, "eval_rewards/rejected": -0.15122570097446442, "eval_runtime": 384.0424, "eval_samples_per_second": 11.207, "eval_steps_per_second": 1.401, "step": 4600 }, { "epoch": 3.177119228118539, "grad_norm": 1.9577919244766235, "learning_rate": 6.174909499445125e-09, "logits/chosen": -2.8804211616516113, "logits/rejected": -2.8621325492858887, "logps/chosen": -71.59983825683594, "logps/rejected": -76.90446472167969, "loss": 0.6696, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.17211928963661194, "rewards/margins": 0.05256058648228645, "rewards/rejected": -0.22467990219593048, "step": 4610 }, { "epoch": 3.1840110268780153, "grad_norm": 2.180481433868408, "learning_rate": 6.07630273977699e-09, "logits/chosen": -2.917825222015381, "logits/rejected": -2.8942019939422607, "logps/chosen": -72.30247497558594, "logps/rejected": -77.46430969238281, "loss": 0.6632, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.17260505259037018, "rewards/margins": 0.06701594591140747, "rewards/rejected": -0.23962099850177765, "step": 4620 }, { "epoch": 3.190902825637491, "grad_norm": 2.110973358154297, "learning_rate": 5.978380623430152e-09, "logits/chosen": -2.8573174476623535, "logits/rejected": -2.842115640640259, "logps/chosen": -70.88710021972656, "logps/rejected": -76.09236145019531, "loss": 0.6684, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.1769961416721344, "rewards/margins": 0.05527294799685478, "rewards/rejected": -0.23226909339427948, "step": 4630 }, { "epoch": 3.1977946243969675, "grad_norm": 1.9261893033981323, "learning_rate": 5.8811466931437624e-09, "logits/chosen": -2.845853328704834, "logits/rejected": -2.8217179775238037, "logps/chosen": -71.8774185180664, "logps/rejected": -75.34031677246094, "loss": 0.6692, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.1736145317554474, "rewards/margins": 0.05382418632507324, "rewards/rejected": -0.22743871808052063, "step": 4640 }, { "epoch": 3.204686423156444, "grad_norm": 1.9915244579315186, "learning_rate": 5.784604466758955e-09, "logits/chosen": -2.8629231452941895, "logits/rejected": -2.836418390274048, "logps/chosen": -72.9046859741211, "logps/rejected": -75.8555679321289, "loss": 0.6666, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.17054606974124908, "rewards/margins": 0.058819033205509186, "rewards/rejected": -0.22936508059501648, "step": 4650 }, { "epoch": 3.21157822191592, "grad_norm": 1.9014475345611572, "learning_rate": 5.688757437091632e-09, "logits/chosen": -2.8450183868408203, "logits/rejected": -2.828110694885254, "logps/chosen": -71.74043273925781, "logps/rejected": -76.6547622680664, "loss": 0.6664, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.173049196600914, "rewards/margins": 0.05977267026901245, "rewards/rejected": -0.23282186686992645, "step": 4660 }, { "epoch": 3.2184700206753964, "grad_norm": 1.9660433530807495, "learning_rate": 5.593609071806061e-09, "logits/chosen": -2.86476469039917, "logits/rejected": -2.8434720039367676, "logps/chosen": -71.73765563964844, "logps/rejected": -75.04331970214844, "loss": 0.6698, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.17371220886707306, "rewards/margins": 0.05325447395443916, "rewards/rejected": -0.22696669399738312, "step": 4670 }, { "epoch": 3.2253618194348723, "grad_norm": 1.90015709400177, "learning_rate": 5.499162813289407e-09, "logits/chosen": -2.8755059242248535, "logits/rejected": -2.860504388809204, "logps/chosen": -70.20982360839844, "logps/rejected": -75.3865966796875, "loss": 0.6682, "rewards/accuracies": 0.640625, "rewards/chosen": -0.17817863821983337, "rewards/margins": 0.05694174766540527, "rewards/rejected": -0.23512041568756104, "step": 4680 }, { "epoch": 3.2322536181943486, "grad_norm": 2.0442395210266113, "learning_rate": 5.405422078527233e-09, "logits/chosen": -2.8403515815734863, "logits/rejected": -2.8225433826446533, "logps/chosen": -73.66438293457031, "logps/rejected": -78.09309387207031, "loss": 0.6695, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1825314462184906, "rewards/margins": 0.053516704589128494, "rewards/rejected": -0.23604817688465118, "step": 4690 }, { "epoch": 3.239145416953825, "grad_norm": 2.1131651401519775, "learning_rate": 5.312390258979841e-09, "logits/chosen": -2.85274600982666, "logits/rejected": -2.826772451400757, "logps/chosen": -72.89948272705078, "logps/rejected": -75.7077865600586, "loss": 0.6644, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.16620318591594696, "rewards/margins": 0.0634000152349472, "rewards/rejected": -0.22960320115089417, "step": 4700 }, { "epoch": 3.239145416953825, "eval_logits/chosen": -2.961543083190918, "eval_logits/rejected": -2.955733299255371, "eval_logps/chosen": -71.07854461669922, "eval_logps/rejected": -78.44540405273438, "eval_loss": 0.6804670691490173, "eval_rewards/accuracies": 0.6059479713439941, "eval_rewards/chosen": -0.12366647273302078, "eval_rewards/margins": 0.028986424207687378, "eval_rewards/rejected": -0.15265290439128876, "eval_runtime": 384.7361, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 4700 }, { "epoch": 3.2460372157133013, "grad_norm": 1.9549802541732788, "learning_rate": 5.220070720459571e-09, "logits/chosen": -2.833397388458252, "logits/rejected": -2.8211517333984375, "logps/chosen": -71.19657135009766, "logps/rejected": -75.4433822631836, "loss": 0.6727, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.1790410727262497, "rewards/margins": 0.04663190245628357, "rewards/rejected": -0.22567300498485565, "step": 4710 }, { "epoch": 3.2529290144727776, "grad_norm": 2.0317866802215576, "learning_rate": 5.1284668030090485e-09, "logits/chosen": -2.8180994987487793, "logits/rejected": -2.796031951904297, "logps/chosen": -71.46773529052734, "logps/rejected": -76.26029205322266, "loss": 0.6673, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16776514053344727, "rewards/margins": 0.057730309665203094, "rewards/rejected": -0.22549545764923096, "step": 4720 }, { "epoch": 3.2598208132322535, "grad_norm": 1.9394210577011108, "learning_rate": 5.037581820780335e-09, "logits/chosen": -2.8745484352111816, "logits/rejected": -2.840742826461792, "logps/chosen": -74.25862121582031, "logps/rejected": -76.86700439453125, "loss": 0.6645, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.17741765081882477, "rewards/margins": 0.06402811408042908, "rewards/rejected": -0.24144577980041504, "step": 4730 }, { "epoch": 3.26671261199173, "grad_norm": 1.9664959907531738, "learning_rate": 4.947419061915037e-09, "logits/chosen": -2.8466360569000244, "logits/rejected": -2.8232128620147705, "logps/chosen": -72.50716400146484, "logps/rejected": -76.36012268066406, "loss": 0.6652, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -0.16871440410614014, "rewards/margins": 0.06182708218693733, "rewards/rejected": -0.23054146766662598, "step": 4740 }, { "epoch": 3.273604410751206, "grad_norm": 1.9588463306427002, "learning_rate": 4.857981788425305e-09, "logits/chosen": -2.9071202278137207, "logits/rejected": -2.8845252990722656, "logps/chosen": -71.82292175292969, "logps/rejected": -77.28089141845703, "loss": 0.6655, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.17996972799301147, "rewards/margins": 0.06187788397073746, "rewards/rejected": -0.24184763431549072, "step": 4750 }, { "epoch": 3.2804962095106824, "grad_norm": 1.9996967315673828, "learning_rate": 4.7692732360758634e-09, "logits/chosen": -2.8332200050354004, "logits/rejected": -2.8054213523864746, "logps/chosen": -72.18595123291016, "logps/rejected": -76.13382720947266, "loss": 0.6674, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -0.17056967318058014, "rewards/margins": 0.05764305591583252, "rewards/rejected": -0.22821271419525146, "step": 4760 }, { "epoch": 3.2873880082701588, "grad_norm": 2.0007715225219727, "learning_rate": 4.68129661426693e-09, "logits/chosen": -2.8611721992492676, "logits/rejected": -2.8385655879974365, "logps/chosen": -70.39287567138672, "logps/rejected": -75.25458526611328, "loss": 0.6703, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.17333902418613434, "rewards/margins": 0.05287405848503113, "rewards/rejected": -0.22621306777000427, "step": 4770 }, { "epoch": 3.2942798070296346, "grad_norm": 2.041116237640381, "learning_rate": 4.594055105918071e-09, "logits/chosen": -2.8724751472473145, "logits/rejected": -2.847742795944214, "logps/chosen": -71.12216186523438, "logps/rejected": -76.76944732666016, "loss": 0.6639, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.17703698575496674, "rewards/margins": 0.06517043709754944, "rewards/rejected": -0.24220743775367737, "step": 4780 }, { "epoch": 3.301171605789111, "grad_norm": 1.9137120246887207, "learning_rate": 4.507551867353093e-09, "logits/chosen": -2.8578104972839355, "logits/rejected": -2.8448033332824707, "logps/chosen": -71.29341125488281, "logps/rejected": -77.37324523925781, "loss": 0.6675, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.17409269511699677, "rewards/margins": 0.058341849595308304, "rewards/rejected": -0.23243455588817596, "step": 4790 }, { "epoch": 3.3080634045485873, "grad_norm": 2.0397465229034424, "learning_rate": 4.4217900281858236e-09, "logits/chosen": -2.8879497051239014, "logits/rejected": -2.8621504306793213, "logps/chosen": -72.70024108886719, "logps/rejected": -76.26146697998047, "loss": 0.6685, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.18176187574863434, "rewards/margins": 0.05570191144943237, "rewards/rejected": -0.2374637871980667, "step": 4800 }, { "epoch": 3.3080634045485873, "eval_logits/chosen": -2.9605331420898438, "eval_logits/rejected": -2.9547009468078613, "eval_logps/chosen": -71.16743469238281, "eval_logps/rejected": -78.54407501220703, "eval_loss": 0.6804293394088745, "eval_rewards/accuracies": 0.6052509546279907, "eval_rewards/chosen": -0.12455536425113678, "eval_rewards/margins": 0.029084132984280586, "eval_rewards/rejected": -0.15363949537277222, "eval_runtime": 384.6634, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 4800 }, { "epoch": 3.3149552033080636, "grad_norm": 1.8728365898132324, "learning_rate": 4.336772691206877e-09, "logits/chosen": -2.9032750129699707, "logits/rejected": -2.8813698291778564, "logps/chosen": -71.68211364746094, "logps/rejected": -76.97119903564453, "loss": 0.6647, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.17238381505012512, "rewards/margins": 0.06265820562839508, "rewards/rejected": -0.235042005777359, "step": 4810 }, { "epoch": 3.3218470020675395, "grad_norm": 1.91660475730896, "learning_rate": 4.252502932271423e-09, "logits/chosen": -2.8713207244873047, "logits/rejected": -2.847630262374878, "logps/chosen": -72.81474304199219, "logps/rejected": -77.03373718261719, "loss": 0.6657, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.17396162450313568, "rewards/margins": 0.06114625185728073, "rewards/rejected": -0.2351078987121582, "step": 4820 }, { "epoch": 3.328738800827016, "grad_norm": 1.9135323762893677, "learning_rate": 4.168983800187892e-09, "logits/chosen": -2.8688721656799316, "logits/rejected": -2.8506999015808105, "logps/chosen": -72.09941101074219, "logps/rejected": -75.62064361572266, "loss": 0.6721, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.1842980682849884, "rewards/margins": 0.047958116978406906, "rewards/rejected": -0.23225617408752441, "step": 4830 }, { "epoch": 3.335630599586492, "grad_norm": 1.9678868055343628, "learning_rate": 4.086218316607654e-09, "logits/chosen": -2.8234031200408936, "logits/rejected": -2.7995800971984863, "logps/chosen": -73.12089538574219, "logps/rejected": -78.00308990478516, "loss": 0.6659, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.17803508043289185, "rewards/margins": 0.06067787855863571, "rewards/rejected": -0.23871295154094696, "step": 4840 }, { "epoch": 3.3425223983459684, "grad_norm": 2.06929349899292, "learning_rate": 4.004209475915732e-09, "logits/chosen": -2.900522470474243, "logits/rejected": -2.8752999305725098, "logps/chosen": -71.86921691894531, "logps/rejected": -76.11266326904297, "loss": 0.665, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.17574940621852875, "rewards/margins": 0.06345906853675842, "rewards/rejected": -0.23920850455760956, "step": 4850 }, { "epoch": 3.3494141971054443, "grad_norm": 1.893026351928711, "learning_rate": 3.9229602451224554e-09, "logits/chosen": -2.8786511421203613, "logits/rejected": -2.85194993019104, "logps/chosen": -73.745849609375, "logps/rejected": -76.59969329833984, "loss": 0.6696, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18215957283973694, "rewards/margins": 0.05310459062457085, "rewards/rejected": -0.2352641522884369, "step": 4860 }, { "epoch": 3.3563059958649206, "grad_norm": 2.0597856044769287, "learning_rate": 3.8424735637560965e-09, "logits/chosen": -2.8564953804016113, "logits/rejected": -2.8326077461242676, "logps/chosen": -72.75129699707031, "logps/rejected": -76.74140167236328, "loss": 0.6684, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.17731256783008575, "rewards/margins": 0.055709563195705414, "rewards/rejected": -0.23302213847637177, "step": 4870 }, { "epoch": 3.363197794624397, "grad_norm": 2.080430269241333, "learning_rate": 3.762752343756531e-09, "logits/chosen": -2.852489471435547, "logits/rejected": -2.828636646270752, "logps/chosen": -71.9737319946289, "logps/rejected": -76.49903869628906, "loss": 0.6687, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.17835985124111176, "rewards/margins": 0.054557718336582184, "rewards/rejected": -0.23291757702827454, "step": 4880 }, { "epoch": 3.3700895933838733, "grad_norm": 1.9526216983795166, "learning_rate": 3.683799469369919e-09, "logits/chosen": -2.9188039302825928, "logits/rejected": -2.8961312770843506, "logps/chosen": -72.2201156616211, "logps/rejected": -77.29249572753906, "loss": 0.6636, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.17195996642112732, "rewards/margins": 0.06554337590932846, "rewards/rejected": -0.23750336468219757, "step": 4890 }, { "epoch": 3.3769813921433496, "grad_norm": 2.124680995941162, "learning_rate": 3.6056177970442995e-09, "logits/chosen": -2.9035253524780273, "logits/rejected": -2.8732361793518066, "logps/chosen": -70.88807678222656, "logps/rejected": -75.14522552490234, "loss": 0.6651, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -0.16689960658550262, "rewards/margins": 0.06185099482536316, "rewards/rejected": -0.22875061631202698, "step": 4900 }, { "epoch": 3.3769813921433496, "eval_logits/chosen": -2.9597761631011963, "eval_logits/rejected": -2.953935384750366, "eval_logps/chosen": -71.20720672607422, "eval_logps/rejected": -78.60298919677734, "eval_loss": 0.6803425550460815, "eval_rewards/accuracies": 0.6038568615913391, "eval_rewards/chosen": -0.12495309114456177, "eval_rewards/margins": 0.02927566133439541, "eval_rewards/rejected": -0.15422874689102173, "eval_runtime": 384.4963, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 4900 }, { "epoch": 3.3838731909028255, "grad_norm": 1.95071542263031, "learning_rate": 3.528210155326289e-09, "logits/chosen": -2.8900961875915527, "logits/rejected": -2.863687515258789, "logps/chosen": -71.15364074707031, "logps/rejected": -75.98387145996094, "loss": 0.6635, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.16726848483085632, "rewards/margins": 0.06759568303823471, "rewards/rejected": -0.23486416041851044, "step": 4910 }, { "epoch": 3.3907649896623018, "grad_norm": 2.1279611587524414, "learning_rate": 3.4515793447587342e-09, "logits/chosen": -2.8516290187835693, "logits/rejected": -2.824493169784546, "logps/chosen": -72.39521789550781, "logps/rejected": -74.69700622558594, "loss": 0.6694, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.1769055426120758, "rewards/margins": 0.05311691761016846, "rewards/rejected": -0.23002246022224426, "step": 4920 }, { "epoch": 3.397656788421778, "grad_norm": 2.018932580947876, "learning_rate": 3.3757281377793793e-09, "logits/chosen": -2.8963863849639893, "logits/rejected": -2.8794009685516357, "logps/chosen": -70.18763732910156, "logps/rejected": -76.82774353027344, "loss": 0.6704, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.16775056719779968, "rewards/margins": 0.05121440812945366, "rewards/rejected": -0.21896497905254364, "step": 4930 }, { "epoch": 3.4045485871812544, "grad_norm": 2.0735890865325928, "learning_rate": 3.3006592786205793e-09, "logits/chosen": -2.91428804397583, "logits/rejected": -2.893552303314209, "logps/chosen": -70.70872497558594, "logps/rejected": -76.18577575683594, "loss": 0.6633, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.16524307429790497, "rewards/margins": 0.06604791432619095, "rewards/rejected": -0.23129098117351532, "step": 4940 }, { "epoch": 3.4114403859407307, "grad_norm": 1.972838044166565, "learning_rate": 3.226375483210017e-09, "logits/chosen": -2.8463799953460693, "logits/rejected": -2.8235983848571777, "logps/chosen": -70.33956146240234, "logps/rejected": -76.64471435546875, "loss": 0.6657, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.17534813284873962, "rewards/margins": 0.0618339404463768, "rewards/rejected": -0.23718206584453583, "step": 4950 }, { "epoch": 3.4183321847002066, "grad_norm": 2.009845018386841, "learning_rate": 3.152879439072409e-09, "logits/chosen": -2.8654890060424805, "logits/rejected": -2.842156410217285, "logps/chosen": -73.66585540771484, "logps/rejected": -74.80780029296875, "loss": 0.6702, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.18469473719596863, "rewards/margins": 0.052499882876873016, "rewards/rejected": -0.23719461262226105, "step": 4960 }, { "epoch": 3.425223983459683, "grad_norm": 2.0229947566986084, "learning_rate": 3.0801738052323224e-09, "logits/chosen": -2.8899052143096924, "logits/rejected": -2.8697075843811035, "logps/chosen": -71.63045501708984, "logps/rejected": -76.57025909423828, "loss": 0.6658, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17431309819221497, "rewards/margins": 0.060704149305820465, "rewards/rejected": -0.23501725494861603, "step": 4970 }, { "epoch": 3.4321157822191593, "grad_norm": 1.9472365379333496, "learning_rate": 3.0082612121179434e-09, "logits/chosen": -2.884010076522827, "logits/rejected": -2.8511712551116943, "logps/chosen": -71.83345794677734, "logps/rejected": -74.82569885253906, "loss": 0.6641, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.16892683506011963, "rewards/margins": 0.06412702798843384, "rewards/rejected": -0.23305387794971466, "step": 4980 }, { "epoch": 3.4390075809786356, "grad_norm": 1.9328975677490234, "learning_rate": 2.9371442614659096e-09, "logits/chosen": -2.8814871311187744, "logits/rejected": -2.8637237548828125, "logps/chosen": -73.43669128417969, "logps/rejected": -78.71479797363281, "loss": 0.6691, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18112850189208984, "rewards/margins": 0.05441933870315552, "rewards/rejected": -0.23554782569408417, "step": 4990 }, { "epoch": 3.445899379738112, "grad_norm": 1.970980167388916, "learning_rate": 2.8668255262271985e-09, "logits/chosen": -2.8773982524871826, "logits/rejected": -2.8519210815429688, "logps/chosen": -71.43087768554688, "logps/rejected": -74.98974609375, "loss": 0.6689, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.16911523044109344, "rewards/margins": 0.05404984951019287, "rewards/rejected": -0.22316507995128632, "step": 5000 }, { "epoch": 3.445899379738112, "eval_logits/chosen": -2.958827495574951, "eval_logits/rejected": -2.9530227184295654, "eval_logps/chosen": -71.25025939941406, "eval_logps/rejected": -78.64763641357422, "eval_loss": 0.6803478598594666, "eval_rewards/accuracies": 0.6061803102493286, "eval_rewards/chosen": -0.125383660197258, "eval_rewards/margins": 0.029291439801454544, "eval_rewards/rejected": -0.15467508137226105, "eval_runtime": 384.6251, "eval_samples_per_second": 11.19, "eval_steps_per_second": 1.399, "step": 5000 }, { "epoch": 3.4527911784975878, "grad_norm": 2.0206921100616455, "learning_rate": 2.7973075504740317e-09, "logits/chosen": -2.8844199180603027, "logits/rejected": -2.8580071926116943, "logps/chosen": -70.33631134033203, "logps/rejected": -73.91645812988281, "loss": 0.6693, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.175685316324234, "rewards/margins": 0.05314185470342636, "rewards/rejected": -0.22882714867591858, "step": 5010 }, { "epoch": 3.459682977257064, "grad_norm": 2.157289981842041, "learning_rate": 2.7285928493078174e-09, "logits/chosen": -2.8504130840301514, "logits/rejected": -2.8349666595458984, "logps/chosen": -72.86569213867188, "logps/rejected": -76.90276336669922, "loss": 0.6695, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.1828501969575882, "rewards/margins": 0.053870148956775665, "rewards/rejected": -0.23672032356262207, "step": 5020 }, { "epoch": 3.4665747760165404, "grad_norm": 1.8843340873718262, "learning_rate": 2.660683908768191e-09, "logits/chosen": -2.8714962005615234, "logits/rejected": -2.8483474254608154, "logps/chosen": -69.8622055053711, "logps/rejected": -74.93540954589844, "loss": 0.6681, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17769171297550201, "rewards/margins": 0.05554303526878357, "rewards/rejected": -0.23323476314544678, "step": 5030 }, { "epoch": 3.4734665747760167, "grad_norm": 2.0690054893493652, "learning_rate": 2.5935831857430283e-09, "logits/chosen": -2.8173763751983643, "logits/rejected": -2.7945051193237305, "logps/chosen": -72.05995178222656, "logps/rejected": -75.03291320800781, "loss": 0.6697, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.17713730037212372, "rewards/margins": 0.052860189229249954, "rewards/rejected": -0.22999748587608337, "step": 5040 }, { "epoch": 3.4803583735354926, "grad_norm": 1.9472019672393799, "learning_rate": 2.527293107879602e-09, "logits/chosen": -2.8307979106903076, "logits/rejected": -2.808256149291992, "logps/chosen": -69.93370056152344, "logps/rejected": -74.14271545410156, "loss": 0.6692, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.17580489814281464, "rewards/margins": 0.053970418870449066, "rewards/rejected": -0.2297753393650055, "step": 5050 }, { "epoch": 3.487250172294969, "grad_norm": 1.9730207920074463, "learning_rate": 2.4618160734967168e-09, "logits/chosen": -2.8944809436798096, "logits/rejected": -2.868194341659546, "logps/chosen": -71.67644500732422, "logps/rejected": -76.07598114013672, "loss": 0.6715, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.18108238279819489, "rewards/margins": 0.048933278769254684, "rewards/rejected": -0.23001566529273987, "step": 5060 }, { "epoch": 3.4941419710544452, "grad_norm": 2.145080804824829, "learning_rate": 2.397154451497957e-09, "logits/chosen": -2.890610694885254, "logits/rejected": -2.878953456878662, "logps/chosen": -72.08394622802734, "logps/rejected": -78.22209167480469, "loss": 0.6657, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1773787885904312, "rewards/margins": 0.061196696013212204, "rewards/rejected": -0.2385755032300949, "step": 5070 }, { "epoch": 3.5010337698139216, "grad_norm": 1.9887871742248535, "learning_rate": 2.333310581285988e-09, "logits/chosen": -2.844324827194214, "logits/rejected": -2.8153486251831055, "logps/chosen": -71.37538146972656, "logps/rejected": -78.40065002441406, "loss": 0.6598, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.167982816696167, "rewards/margins": 0.07294406741857529, "rewards/rejected": -0.2409268617630005, "step": 5080 }, { "epoch": 3.5079255685733974, "grad_norm": 2.055562973022461, "learning_rate": 2.27028677267789e-09, "logits/chosen": -2.8457157611846924, "logits/rejected": -2.827517032623291, "logps/chosen": -70.66600799560547, "logps/rejected": -75.45985412597656, "loss": 0.6653, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.17708957195281982, "rewards/margins": 0.06284669786691666, "rewards/rejected": -0.23993626236915588, "step": 5090 }, { "epoch": 3.5148173673328738, "grad_norm": 2.0824100971221924, "learning_rate": 2.2080853058216274e-09, "logits/chosen": -2.8466343879699707, "logits/rejected": -2.829312801361084, "logps/chosen": -70.18838500976562, "logps/rejected": -77.3112564086914, "loss": 0.6653, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.17904981970787048, "rewards/margins": 0.06212810426950455, "rewards/rejected": -0.24117791652679443, "step": 5100 }, { "epoch": 3.5148173673328738, "eval_logits/chosen": -2.9583287239074707, "eval_logits/rejected": -2.9524905681610107, "eval_logps/chosen": -71.27213287353516, "eval_logps/rejected": -78.69546508789062, "eval_loss": 0.6802259683609009, "eval_rewards/accuracies": 0.6050186157226562, "eval_rewards/chosen": -0.12560229003429413, "eval_rewards/margins": 0.029551101848483086, "eval_rewards/rejected": -0.15515340864658356, "eval_runtime": 384.4265, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 5100 }, { "epoch": 3.52170916609235, "grad_norm": 2.1451125144958496, "learning_rate": 2.1467084311135226e-09, "logits/chosen": -2.844409942626953, "logits/rejected": -2.8222174644470215, "logps/chosen": -72.57167053222656, "logps/rejected": -76.774658203125, "loss": 0.6686, "rewards/accuracies": 0.625, "rewards/chosen": -0.1831672042608261, "rewards/margins": 0.05601830407977104, "rewards/rejected": -0.23918552696704865, "step": 5110 }, { "epoch": 3.5286009648518264, "grad_norm": 1.9240217208862305, "learning_rate": 2.0861583691168637e-09, "logits/chosen": -2.879807710647583, "logits/rejected": -2.8652496337890625, "logps/chosen": -71.79307556152344, "logps/rejected": -75.7448501586914, "loss": 0.6686, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.17770828306674957, "rewards/margins": 0.05711061879992485, "rewards/rejected": -0.2348189353942871, "step": 5120 }, { "epoch": 3.5354927636113027, "grad_norm": 2.2481539249420166, "learning_rate": 2.0264373104815602e-09, "logits/chosen": -2.8342843055725098, "logits/rejected": -2.8115406036376953, "logps/chosen": -72.83387756347656, "logps/rejected": -76.70985412597656, "loss": 0.6676, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.17890271544456482, "rewards/margins": 0.05751532316207886, "rewards/rejected": -0.23641805350780487, "step": 5130 }, { "epoch": 3.5423845623707786, "grad_norm": 2.0398712158203125, "learning_rate": 1.967547415864862e-09, "logits/chosen": -2.87103009223938, "logits/rejected": -2.8476879596710205, "logps/chosen": -70.97459411621094, "logps/rejected": -76.2702865600586, "loss": 0.6642, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.17735496163368225, "rewards/margins": 0.06495282053947449, "rewards/rejected": -0.24230778217315674, "step": 5140 }, { "epoch": 3.549276361130255, "grad_norm": 1.9899929761886597, "learning_rate": 1.909490815853232e-09, "logits/chosen": -2.8462300300598145, "logits/rejected": -2.829967975616455, "logps/chosen": -71.70545959472656, "logps/rejected": -77.56938934326172, "loss": 0.6657, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.17894436419010162, "rewards/margins": 0.06172887235879898, "rewards/rejected": -0.24067322909832, "step": 5150 }, { "epoch": 3.5561681598897312, "grad_norm": 1.985331654548645, "learning_rate": 1.8522696108852348e-09, "logits/chosen": -2.8789024353027344, "logits/rejected": -2.862795352935791, "logps/chosen": -73.25022888183594, "logps/rejected": -76.40839385986328, "loss": 0.6727, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.17978349328041077, "rewards/margins": 0.0467870831489563, "rewards/rejected": -0.22657057642936707, "step": 5160 }, { "epoch": 3.5630599586492075, "grad_norm": 1.914067029953003, "learning_rate": 1.795885871175537e-09, "logits/chosen": -2.8835418224334717, "logits/rejected": -2.8561511039733887, "logps/chosen": -72.01161193847656, "logps/rejected": -76.25904846191406, "loss": 0.663, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.17402271926403046, "rewards/margins": 0.06726004183292389, "rewards/rejected": -0.24128277599811554, "step": 5170 }, { "epoch": 3.569951757408684, "grad_norm": 2.0904057025909424, "learning_rate": 1.7403416366400385e-09, "logits/chosen": -2.9234232902526855, "logits/rejected": -2.9129858016967773, "logps/chosen": -71.43891906738281, "logps/rejected": -79.7889404296875, "loss": 0.6649, "rewards/accuracies": 0.682812511920929, "rewards/chosen": -0.175464928150177, "rewards/margins": 0.06362753361463547, "rewards/rejected": -0.23909242451190948, "step": 5180 }, { "epoch": 3.5768435561681597, "grad_norm": 1.8405756950378418, "learning_rate": 1.6856389168220547e-09, "logits/chosen": -2.8387389183044434, "logits/rejected": -2.8279340267181396, "logps/chosen": -70.47148132324219, "logps/rejected": -75.91267395019531, "loss": 0.6722, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.18103864789009094, "rewards/margins": 0.04757487401366234, "rewards/rejected": -0.22861354053020477, "step": 5190 }, { "epoch": 3.583735354927636, "grad_norm": 1.8923529386520386, "learning_rate": 1.6317796908195985e-09, "logits/chosen": -2.8862547874450684, "logits/rejected": -2.8609628677368164, "logps/chosen": -71.60331726074219, "logps/rejected": -75.79267883300781, "loss": 0.6664, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.1770898550748825, "rewards/margins": 0.059722352772951126, "rewards/rejected": -0.23681220412254333, "step": 5200 }, { "epoch": 3.583735354927636, "eval_logits/chosen": -2.9576752185821533, "eval_logits/rejected": -2.951852321624756, "eval_logps/chosen": -71.32258605957031, "eval_logps/rejected": -78.73802185058594, "eval_loss": 0.680260419845581, "eval_rewards/accuracies": 0.6045538783073425, "eval_rewards/chosen": -0.12610679864883423, "eval_rewards/margins": 0.02947220578789711, "eval_rewards/rejected": -0.15557900071144104, "eval_runtime": 384.5579, "eval_samples_per_second": 11.192, "eval_steps_per_second": 1.399, "step": 5200 }, { "epoch": 3.5906271536871124, "grad_norm": 2.064666271209717, "learning_rate": 1.5787659072137944e-09, "logits/chosen": -2.8419549465179443, "logits/rejected": -2.8221335411071777, "logps/chosen": -70.70294189453125, "logps/rejected": -77.23753356933594, "loss": 0.6668, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.17303071916103363, "rewards/margins": 0.0588606521487236, "rewards/rejected": -0.23189136385917664, "step": 5210 }, { "epoch": 3.5975189524465887, "grad_norm": 1.9493430852890015, "learning_rate": 1.5265994839983893e-09, "logits/chosen": -2.8689472675323486, "logits/rejected": -2.835707187652588, "logps/chosen": -72.75984954833984, "logps/rejected": -76.3830795288086, "loss": 0.6644, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.17222420871257782, "rewards/margins": 0.06404378265142441, "rewards/rejected": -0.23626796901226044, "step": 5220 }, { "epoch": 3.604410751206065, "grad_norm": 1.8927443027496338, "learning_rate": 1.4752823085103476e-09, "logits/chosen": -2.843079090118408, "logits/rejected": -2.8202366828918457, "logps/chosen": -73.2348861694336, "logps/rejected": -75.82610321044922, "loss": 0.6673, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17618897557258606, "rewards/margins": 0.05725538730621338, "rewards/rejected": -0.23344437777996063, "step": 5230 }, { "epoch": 3.611302549965541, "grad_norm": 2.006829261779785, "learning_rate": 1.4248162373615536e-09, "logits/chosen": -2.8521506786346436, "logits/rejected": -2.8320302963256836, "logps/chosen": -71.88185119628906, "logps/rejected": -77.86662292480469, "loss": 0.6636, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.1736433058977127, "rewards/margins": 0.06516421586275101, "rewards/rejected": -0.2388075292110443, "step": 5240 }, { "epoch": 3.618194348725017, "grad_norm": 1.9538016319274902, "learning_rate": 1.37520309637168e-09, "logits/chosen": -2.839338779449463, "logits/rejected": -2.8204591274261475, "logps/chosen": -71.8875503540039, "logps/rejected": -77.00475311279297, "loss": 0.6661, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.1710798591375351, "rewards/margins": 0.06200499087572098, "rewards/rejected": -0.23308484256267548, "step": 5250 }, { "epoch": 3.6250861474844935, "grad_norm": 1.9293245077133179, "learning_rate": 1.326444680502098e-09, "logits/chosen": -2.8854546546936035, "logits/rejected": -2.863419771194458, "logps/chosen": -72.25894927978516, "logps/rejected": -74.59315490722656, "loss": 0.6693, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.17469267547130585, "rewards/margins": 0.0536469928920269, "rewards/rejected": -0.22833967208862305, "step": 5260 }, { "epoch": 3.6319779462439694, "grad_norm": 2.0885043144226074, "learning_rate": 1.2785427537909481e-09, "logits/chosen": -2.8738741874694824, "logits/rejected": -2.8549060821533203, "logps/chosen": -71.40998077392578, "logps/rejected": -76.88813018798828, "loss": 0.6648, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.17923898994922638, "rewards/margins": 0.06315573304891586, "rewards/rejected": -0.24239471554756165, "step": 5270 }, { "epoch": 3.638869745003446, "grad_norm": 1.9861798286437988, "learning_rate": 1.2314990492893278e-09, "logits/chosen": -2.830810785293579, "logits/rejected": -2.809612512588501, "logps/chosen": -71.61537170410156, "logps/rejected": -76.6853256225586, "loss": 0.6663, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.18130019307136536, "rewards/margins": 0.059529464691877365, "rewards/rejected": -0.24082966148853302, "step": 5280 }, { "epoch": 3.645761543762922, "grad_norm": 2.1104493141174316, "learning_rate": 1.185315268998574e-09, "logits/chosen": -2.865112781524658, "logits/rejected": -2.84001088142395, "logps/chosen": -71.56957244873047, "logps/rejected": -76.92543029785156, "loss": 0.6668, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -0.18373079597949982, "rewards/margins": 0.059225063771009445, "rewards/rejected": -0.24295584857463837, "step": 5290 }, { "epoch": 3.6526533425223984, "grad_norm": 2.06616473197937, "learning_rate": 1.1399930838086962e-09, "logits/chosen": -2.8636443614959717, "logits/rejected": -2.8466105461120605, "logps/chosen": -71.50733947753906, "logps/rejected": -76.5252914428711, "loss": 0.6687, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.18077199161052704, "rewards/margins": 0.05519663542509079, "rewards/rejected": -0.23596861958503723, "step": 5300 }, { "epoch": 3.6526533425223984, "eval_logits/chosen": -2.9573869705200195, "eval_logits/rejected": -2.951564311981201, "eval_logps/chosen": -71.357177734375, "eval_logps/rejected": -78.77005767822266, "eval_loss": 0.6802834868431091, "eval_rewards/accuracies": 0.6064126491546631, "eval_rewards/chosen": -0.12645280361175537, "eval_rewards/margins": 0.029446497559547424, "eval_rewards/rejected": -0.1558993011713028, "eval_runtime": 384.5252, "eval_samples_per_second": 11.193, "eval_steps_per_second": 1.399, "step": 5300 }, { "epoch": 3.6595451412818747, "grad_norm": 1.9328287839889526, "learning_rate": 1.095534133437928e-09, "logits/chosen": -2.8020670413970947, "logits/rejected": -2.7735095024108887, "logps/chosen": -71.3703842163086, "logps/rejected": -75.2965316772461, "loss": 0.6657, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17763185501098633, "rewards/margins": 0.06172214820981026, "rewards/rejected": -0.2393539845943451, "step": 5310 }, { "epoch": 3.6664369400413506, "grad_norm": 2.0308992862701416, "learning_rate": 1.051940026373399e-09, "logits/chosen": -2.908254861831665, "logits/rejected": -2.879467248916626, "logps/chosen": -73.16458129882812, "logps/rejected": -77.05401611328125, "loss": 0.6699, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.1833791881799698, "rewards/margins": 0.05216316133737564, "rewards/rejected": -0.23554232716560364, "step": 5320 }, { "epoch": 3.673328738800827, "grad_norm": 2.064967155456543, "learning_rate": 1.0092123398129343e-09, "logits/chosen": -2.899106502532959, "logits/rejected": -2.8768978118896484, "logps/chosen": -71.95287322998047, "logps/rejected": -75.36768341064453, "loss": 0.6672, "rewards/accuracies": 0.671875, "rewards/chosen": -0.178888738155365, "rewards/margins": 0.057881515473127365, "rewards/rejected": -0.23677024245262146, "step": 5330 }, { "epoch": 3.680220537560303, "grad_norm": 1.9787522554397583, "learning_rate": 9.673526196080029e-10, "logits/chosen": -2.787376880645752, "logits/rejected": -2.7672460079193115, "logps/chosen": -71.5352554321289, "logps/rejected": -76.34475708007812, "loss": 0.664, "rewards/accuracies": 0.671875, "rewards/chosen": -0.17524996399879456, "rewards/margins": 0.06488049030303955, "rewards/rejected": -0.2401304543018341, "step": 5340 }, { "epoch": 3.6871123363197795, "grad_norm": 1.9303803443908691, "learning_rate": 9.263623802078014e-10, "logits/chosen": -2.8560631275177, "logits/rejected": -2.8368136882781982, "logps/chosen": -72.25008392333984, "logps/rejected": -76.45013427734375, "loss": 0.6694, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1830962598323822, "rewards/margins": 0.05351614952087402, "rewards/rejected": -0.23661240935325623, "step": 5350 }, { "epoch": 3.694004135079256, "grad_norm": 1.992553472518921, "learning_rate": 8.862431046044172e-10, "logits/chosen": -2.861959218978882, "logits/rejected": -2.8425135612487793, "logps/chosen": -70.85562133789062, "logps/rejected": -77.3700180053711, "loss": 0.6668, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.17940828204154968, "rewards/margins": 0.058393657207489014, "rewards/rejected": -0.2378019541501999, "step": 5360 }, { "epoch": 3.7008959338387317, "grad_norm": 1.9431288242340088, "learning_rate": 8.469962442792355e-10, "logits/chosen": -2.872873067855835, "logits/rejected": -2.854335308074951, "logps/chosen": -70.85237121582031, "logps/rejected": -77.03694915771484, "loss": 0.6642, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -0.17005841434001923, "rewards/margins": 0.06496603041887283, "rewards/rejected": -0.23502442240715027, "step": 5370 }, { "epoch": 3.707787732598208, "grad_norm": 2.030609607696533, "learning_rate": 8.086232191503839e-10, "logits/chosen": -2.839414596557617, "logits/rejected": -2.821842908859253, "logps/chosen": -70.74089813232422, "logps/rejected": -77.28166198730469, "loss": 0.6674, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.1744275689125061, "rewards/margins": 0.057841770350933075, "rewards/rejected": -0.23226933181285858, "step": 5380 }, { "epoch": 3.7146795313576844, "grad_norm": 2.1407716274261475, "learning_rate": 7.711254175213705e-10, "logits/chosen": -2.874070882797241, "logits/rejected": -2.862229824066162, "logps/chosen": -72.07778930664062, "logps/rejected": -76.9394302368164, "loss": 0.6697, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.17681041359901428, "rewards/margins": 0.05296726152300835, "rewards/rejected": -0.22977769374847412, "step": 5390 }, { "epoch": 3.7215713301171607, "grad_norm": 2.038729429244995, "learning_rate": 7.345041960308663e-10, "logits/chosen": -2.8218791484832764, "logits/rejected": -2.800405740737915, "logps/chosen": -72.32463073730469, "logps/rejected": -76.3077392578125, "loss": 0.6641, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.18442364037036896, "rewards/margins": 0.06546637415885925, "rewards/rejected": -0.24988999962806702, "step": 5400 }, { "epoch": 3.7215713301171607, "eval_logits/chosen": -2.957261323928833, "eval_logits/rejected": -2.9513814449310303, "eval_logps/chosen": -71.3690185546875, "eval_logps/rejected": -78.78221130371094, "eval_loss": 0.6802814602851868, "eval_rewards/accuracies": 0.6059479713439941, "eval_rewards/chosen": -0.12657111883163452, "eval_rewards/margins": 0.02944965474307537, "eval_rewards/rejected": -0.15602077543735504, "eval_runtime": 384.4225, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.4, "step": 5400 }, { "epoch": 3.728463128876637, "grad_norm": 2.151887893676758, "learning_rate": 6.987608796036132e-10, "logits/chosen": -2.869973659515381, "logits/rejected": -2.848824977874756, "logps/chosen": -73.12203216552734, "logps/rejected": -76.91819763183594, "loss": 0.6699, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.18080058693885803, "rewards/margins": 0.05266581103205681, "rewards/rejected": -0.23346638679504395, "step": 5410 }, { "epoch": 3.735354927636113, "grad_norm": 2.0429258346557617, "learning_rate": 6.638967614024937e-10, "logits/chosen": -2.8832645416259766, "logits/rejected": -2.859003782272339, "logps/chosen": -72.4503402709961, "logps/rejected": -77.4783935546875, "loss": 0.6698, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.17588870227336884, "rewards/margins": 0.052649348974227905, "rewards/rejected": -0.22853806614875793, "step": 5420 }, { "epoch": 3.742246726395589, "grad_norm": 2.0684494972229004, "learning_rate": 6.299131027817401e-10, "logits/chosen": -2.84670090675354, "logits/rejected": -2.8295907974243164, "logps/chosen": -72.0188980102539, "logps/rejected": -76.68207550048828, "loss": 0.6678, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.17961344122886658, "rewards/margins": 0.05706768110394478, "rewards/rejected": -0.23668110370635986, "step": 5430 }, { "epoch": 3.7491385251550655, "grad_norm": 2.0891590118408203, "learning_rate": 5.968111332413095e-10, "logits/chosen": -2.901061534881592, "logits/rejected": -2.878356456756592, "logps/chosen": -73.39814758300781, "logps/rejected": -76.63714599609375, "loss": 0.6716, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.18039409816265106, "rewards/margins": 0.04889310151338577, "rewards/rejected": -0.22928722202777863, "step": 5440 }, { "epoch": 3.756030323914542, "grad_norm": 1.9887423515319824, "learning_rate": 5.645920503823898e-10, "logits/chosen": -2.868621826171875, "logits/rejected": -2.8548598289489746, "logps/chosen": -72.39013671875, "logps/rejected": -76.55297088623047, "loss": 0.6725, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.1871643364429474, "rewards/margins": 0.047739386558532715, "rewards/rejected": -0.2349037230014801, "step": 5450 }, { "epoch": 3.762922122674018, "grad_norm": 2.184643030166626, "learning_rate": 5.332570198640779e-10, "logits/chosen": -2.867690324783325, "logits/rejected": -2.849740505218506, "logps/chosen": -71.39997863769531, "logps/rejected": -76.14476013183594, "loss": 0.6676, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.17770645022392273, "rewards/margins": 0.0574665442109108, "rewards/rejected": -0.23517298698425293, "step": 5460 }, { "epoch": 3.769813921433494, "grad_norm": 2.0979721546173096, "learning_rate": 5.028071753612167e-10, "logits/chosen": -2.9114272594451904, "logits/rejected": -2.8869986534118652, "logps/chosen": -72.30838012695312, "logps/rejected": -75.88555145263672, "loss": 0.6715, "rewards/accuracies": 0.609375, "rewards/chosen": -0.18485192954540253, "rewards/margins": 0.04934918135404587, "rewards/rejected": -0.234201118350029, "step": 5470 }, { "epoch": 3.7767057201929704, "grad_norm": 2.0394034385681152, "learning_rate": 4.73243618523353e-10, "logits/chosen": -2.8775041103363037, "logits/rejected": -2.8537278175354004, "logps/chosen": -71.85142517089844, "logps/rejected": -74.34036254882812, "loss": 0.6698, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.17356619238853455, "rewards/margins": 0.05245218425989151, "rewards/rejected": -0.22601839900016785, "step": 5480 }, { "epoch": 3.7835975189524467, "grad_norm": 1.90375554561615, "learning_rate": 4.4456741893491023e-10, "logits/chosen": -2.871328830718994, "logits/rejected": -2.8571534156799316, "logps/chosen": -71.54598999023438, "logps/rejected": -77.57771301269531, "loss": 0.6659, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.17719712853431702, "rewards/margins": 0.061613988131284714, "rewards/rejected": -0.23881113529205322, "step": 5490 }, { "epoch": 3.7904893177119225, "grad_norm": 2.0748987197875977, "learning_rate": 4.1677961407647345e-10, "logits/chosen": -2.831299304962158, "logits/rejected": -2.804063558578491, "logps/chosen": -73.7288818359375, "logps/rejected": -77.71610260009766, "loss": 0.6637, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.17103148996829987, "rewards/margins": 0.06553999334573746, "rewards/rejected": -0.2365715056657791, "step": 5500 }, { "epoch": 3.7904893177119225, "eval_logits/chosen": -2.957463264465332, "eval_logits/rejected": -2.951613426208496, "eval_logps/chosen": -71.35787963867188, "eval_logps/rejected": -78.77357482910156, "eval_loss": 0.6802683472633362, "eval_rewards/accuracies": 0.6052509546279907, "eval_rewards/chosen": -0.12645983695983887, "eval_rewards/margins": 0.02947470173239708, "eval_rewards/rejected": -0.15593452751636505, "eval_runtime": 384.7, "eval_samples_per_second": 11.188, "eval_steps_per_second": 1.398, "step": 5500 }, { "epoch": 3.7973811164713993, "grad_norm": 1.9732195138931274, "learning_rate": 3.8988120928726274e-10, "logits/chosen": -2.876619815826416, "logits/rejected": -2.857800245285034, "logps/chosen": -72.13102722167969, "logps/rejected": -76.02960205078125, "loss": 0.6651, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.16953328251838684, "rewards/margins": 0.06311596184968948, "rewards/rejected": -0.23264923691749573, "step": 5510 }, { "epoch": 3.804272915230875, "grad_norm": 2.0553536415100098, "learning_rate": 3.6387317772875457e-10, "logits/chosen": -2.859802007675171, "logits/rejected": -2.839473009109497, "logps/chosen": -71.93092346191406, "logps/rejected": -78.13179016113281, "loss": 0.6662, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.1747732162475586, "rewards/margins": 0.05951104685664177, "rewards/rejected": -0.23428425192832947, "step": 5520 }, { "epoch": 3.8111647139903515, "grad_norm": 2.0237348079681396, "learning_rate": 3.3875646034947634e-10, "logits/chosen": -2.8678386211395264, "logits/rejected": -2.8496932983398438, "logps/chosen": -72.11505126953125, "logps/rejected": -75.893798828125, "loss": 0.6676, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.17911019921302795, "rewards/margins": 0.05702906847000122, "rewards/rejected": -0.23613925278186798, "step": 5530 }, { "epoch": 3.818056512749828, "grad_norm": 2.043865203857422, "learning_rate": 3.145319658509699e-10, "logits/chosen": -2.847667932510376, "logits/rejected": -2.834940195083618, "logps/chosen": -72.16231536865234, "logps/rejected": -78.20210266113281, "loss": 0.6665, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1757679134607315, "rewards/margins": 0.06015582010149956, "rewards/rejected": -0.23592372238636017, "step": 5540 }, { "epoch": 3.8249483115093037, "grad_norm": 2.0210862159729004, "learning_rate": 2.9120057065490365e-10, "logits/chosen": -2.844801664352417, "logits/rejected": -2.8246302604675293, "logps/chosen": -72.60627746582031, "logps/rejected": -75.52421569824219, "loss": 0.6722, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.18030142784118652, "rewards/margins": 0.04800591617822647, "rewards/rejected": -0.2283073365688324, "step": 5550 }, { "epoch": 3.83184011026878, "grad_norm": 1.9486416578292847, "learning_rate": 2.687631188713735e-10, "logits/chosen": -2.8291945457458496, "logits/rejected": -2.810871124267578, "logps/chosen": -71.43711853027344, "logps/rejected": -76.3400650024414, "loss": 0.6694, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.1751783788204193, "rewards/margins": 0.053191863000392914, "rewards/rejected": -0.22837023437023163, "step": 5560 }, { "epoch": 3.8387319090282563, "grad_norm": 2.05519700050354, "learning_rate": 2.4722042226835993e-10, "logits/chosen": -2.837230682373047, "logits/rejected": -2.8224728107452393, "logps/chosen": -70.8357925415039, "logps/rejected": -76.91483306884766, "loss": 0.6664, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17754293978214264, "rewards/margins": 0.05984468385577202, "rewards/rejected": -0.23738762736320496, "step": 5570 }, { "epoch": 3.8456237077877327, "grad_norm": 1.878474473953247, "learning_rate": 2.2657326024235755e-10, "logits/chosen": -2.8552939891815186, "logits/rejected": -2.8431477546691895, "logps/chosen": -70.79942321777344, "logps/rejected": -77.08605194091797, "loss": 0.6692, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.1805018037557602, "rewards/margins": 0.05370640754699707, "rewards/rejected": -0.23420822620391846, "step": 5580 }, { "epoch": 3.852515506547209, "grad_norm": 1.997184157371521, "learning_rate": 2.0682237979018636e-10, "logits/chosen": -2.8794636726379395, "logits/rejected": -2.8509156703948975, "logps/chosen": -73.82582092285156, "logps/rejected": -75.158935546875, "loss": 0.6663, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.18050740659236908, "rewards/margins": 0.06038772314786911, "rewards/rejected": -0.2408951222896576, "step": 5590 }, { "epoch": 3.859407305306685, "grad_norm": 2.0682809352874756, "learning_rate": 1.8796849548195215e-10, "logits/chosen": -2.826308250427246, "logits/rejected": -2.808901309967041, "logps/chosen": -72.65235900878906, "logps/rejected": -76.69111633300781, "loss": 0.6694, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.17561136186122894, "rewards/margins": 0.05344085767865181, "rewards/rejected": -0.22905221581459045, "step": 5600 }, { "epoch": 3.859407305306685, "eval_logits/chosen": -2.9573676586151123, "eval_logits/rejected": -2.951545238494873, "eval_logps/chosen": -71.36114501953125, "eval_logps/rejected": -78.78689575195312, "eval_loss": 0.6802201867103577, "eval_rewards/accuracies": 0.6036245226860046, "eval_rewards/chosen": -0.1264924854040146, "eval_rewards/margins": 0.029575219377875328, "eval_rewards/rejected": -0.15606769919395447, "eval_runtime": 384.6647, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 5600 }, { "epoch": 3.866299104066161, "grad_norm": 2.052184581756592, "learning_rate": 1.7001228943520075e-10, "logits/chosen": -2.8588435649871826, "logits/rejected": -2.833009719848633, "logps/chosen": -74.13676452636719, "logps/rejected": -75.88396453857422, "loss": 0.6712, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.17457132041454315, "rewards/margins": 0.04955599457025528, "rewards/rejected": -0.22412733733654022, "step": 5610 }, { "epoch": 3.8731909028256375, "grad_norm": 2.171339750289917, "learning_rate": 1.5295441129024312e-10, "logits/chosen": -2.848580837249756, "logits/rejected": -2.835331439971924, "logps/chosen": -71.81614685058594, "logps/rejected": -77.56596374511719, "loss": 0.6698, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.18499977886676788, "rewards/margins": 0.05274176597595215, "rewards/rejected": -0.23774154484272003, "step": 5620 }, { "epoch": 3.880082701585114, "grad_norm": 1.9862202405929565, "learning_rate": 1.3679547818664927e-10, "logits/chosen": -2.9051053524017334, "logits/rejected": -2.8834900856018066, "logps/chosen": -72.98822784423828, "logps/rejected": -77.17180633544922, "loss": 0.6686, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.176753968000412, "rewards/margins": 0.05489393323659897, "rewards/rejected": -0.23164793848991394, "step": 5630 }, { "epoch": 3.88697450034459, "grad_norm": 1.976684331893921, "learning_rate": 1.2153607474091332e-10, "logits/chosen": -2.859466552734375, "logits/rejected": -2.832249402999878, "logps/chosen": -71.03312683105469, "logps/rejected": -76.01225280761719, "loss": 0.6647, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.1708461344242096, "rewards/margins": 0.06328844279050827, "rewards/rejected": -0.23413459956645966, "step": 5640 }, { "epoch": 3.893866299104066, "grad_norm": 2.033540725708008, "learning_rate": 1.0717675302531482e-10, "logits/chosen": -2.888030529022217, "logits/rejected": -2.867461681365967, "logps/chosen": -72.51081848144531, "logps/rejected": -77.79974365234375, "loss": 0.6652, "rewards/accuracies": 0.671875, "rewards/chosen": -0.18291905522346497, "rewards/margins": 0.062029384076595306, "rewards/rejected": -0.24494843184947968, "step": 5650 }, { "epoch": 3.9007580978635423, "grad_norm": 1.9646199941635132, "learning_rate": 9.371803254794308e-11, "logits/chosen": -2.852908134460449, "logits/rejected": -2.8348288536071777, "logps/chosen": -72.42581939697266, "logps/rejected": -76.13785552978516, "loss": 0.6727, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1831497997045517, "rewards/margins": 0.04659117013216019, "rewards/rejected": -0.22974097728729248, "step": 5660 }, { "epoch": 3.9076498966230186, "grad_norm": 1.9325337409973145, "learning_rate": 8.116040023388448e-11, "logits/chosen": -2.848618984222412, "logits/rejected": -2.826350688934326, "logps/chosen": -72.94657897949219, "logps/rejected": -77.83454895019531, "loss": 0.6641, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1744307577610016, "rewards/margins": 0.06405071169137955, "rewards/rejected": -0.23848147690296173, "step": 5670 }, { "epoch": 3.914541695382495, "grad_norm": 2.021181583404541, "learning_rate": 6.950431040763371e-11, "logits/chosen": -2.8645360469818115, "logits/rejected": -2.847898483276367, "logps/chosen": -70.85758209228516, "logps/rejected": -77.56425476074219, "loss": 0.6687, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.18004465103149414, "rewards/margins": 0.0549914613366127, "rewards/rejected": -0.23503610491752625, "step": 5680 }, { "epoch": 3.9214334941419713, "grad_norm": 1.918362021446228, "learning_rate": 5.875018477663752e-11, "logits/chosen": -2.8878121376037598, "logits/rejected": -2.861017942428589, "logps/chosen": -72.43756866455078, "logps/rejected": -76.90264129638672, "loss": 0.6647, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.17562000453472137, "rewards/margins": 0.06314704567193985, "rewards/rejected": -0.23876705765724182, "step": 5690 }, { "epoch": 3.928325292901447, "grad_norm": 2.1007118225097656, "learning_rate": 4.8898412416040203e-11, "logits/chosen": -2.9333157539367676, "logits/rejected": -2.9187569618225098, "logps/chosen": -72.62559509277344, "logps/rejected": -79.22654724121094, "loss": 0.6684, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -0.17988933622837067, "rewards/margins": 0.05617032200098038, "rewards/rejected": -0.23605963587760925, "step": 5700 }, { "epoch": 3.928325292901447, "eval_logits/chosen": -2.95705509185791, "eval_logits/rejected": -2.951220750808716, "eval_logps/chosen": -71.3707275390625, "eval_logps/rejected": -78.7791976928711, "eval_loss": 0.680305004119873, "eval_rewards/accuracies": 0.6071096658706665, "eval_rewards/chosen": -0.126588374376297, "eval_rewards/margins": 0.029402369633316994, "eval_rewards/rejected": -0.15599074959754944, "eval_runtime": 384.8774, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 5700 }, { "epoch": 3.9352170916609235, "grad_norm": 2.1666312217712402, "learning_rate": 3.994934975461439e-11, "logits/chosen": -2.8247122764587402, "logits/rejected": -2.8089518547058105, "logps/chosen": -70.52175903320312, "logps/rejected": -76.8604507446289, "loss": 0.667, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.17770597338676453, "rewards/margins": 0.058614857494831085, "rewards/rejected": -0.23632082343101501, "step": 5710 }, { "epoch": 3.9421088904204, "grad_norm": 1.851887583732605, "learning_rate": 3.190332056186018e-11, "logits/chosen": -2.841176748275757, "logits/rejected": -2.8267760276794434, "logps/chosen": -70.0234146118164, "logps/rejected": -74.63334655761719, "loss": 0.671, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.1831142008304596, "rewards/margins": 0.04964384809136391, "rewards/rejected": -0.23275801539421082, "step": 5720 }, { "epoch": 3.9490006891798757, "grad_norm": 2.0131776332855225, "learning_rate": 2.4760615936289532e-11, "logits/chosen": -2.7864444255828857, "logits/rejected": -2.762784719467163, "logps/chosen": -73.19425201416016, "logps/rejected": -76.13724517822266, "loss": 0.6686, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.17625829577445984, "rewards/margins": 0.055087197571992874, "rewards/rejected": -0.23134548962116241, "step": 5730 }, { "epoch": 3.9558924879393524, "grad_norm": 2.176494836807251, "learning_rate": 1.8521494294898578e-11, "logits/chosen": -2.828831434249878, "logits/rejected": -2.7964935302734375, "logps/chosen": -71.51968383789062, "logps/rejected": -76.20378112792969, "loss": 0.6642, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.1779634952545166, "rewards/margins": 0.06530503183603287, "rewards/rejected": -0.24326848983764648, "step": 5740 }, { "epoch": 3.9627842866988283, "grad_norm": 1.9711962938308716, "learning_rate": 1.318618136381955e-11, "logits/chosen": -2.8637359142303467, "logits/rejected": -2.8356573581695557, "logps/chosen": -72.66876220703125, "logps/rejected": -76.87183380126953, "loss": 0.6671, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18348637223243713, "rewards/margins": 0.058950237929821014, "rewards/rejected": -0.24243660271167755, "step": 5750 }, { "epoch": 3.9696760854583046, "grad_norm": 2.0854268074035645, "learning_rate": 8.75487017014953e-12, "logits/chosen": -2.8128905296325684, "logits/rejected": -2.7940261363983154, "logps/chosen": -70.58373260498047, "logps/rejected": -77.01518249511719, "loss": 0.6645, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.17402105033397675, "rewards/margins": 0.06374243646860123, "rewards/rejected": -0.23776349425315857, "step": 5760 }, { "epoch": 3.976567884217781, "grad_norm": 2.2225143909454346, "learning_rate": 5.227721034969934e-12, "logits/chosen": -2.8765082359313965, "logits/rejected": -2.8556480407714844, "logps/chosen": -71.49765014648438, "logps/rejected": -77.3117446899414, "loss": 0.6627, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.16837500035762787, "rewards/margins": 0.06768227368593216, "rewards/rejected": -0.23605728149414062, "step": 5770 }, { "epoch": 3.983459682977257, "grad_norm": 2.0012478828430176, "learning_rate": 2.6048615675483555e-12, "logits/chosen": -2.9059433937072754, "logits/rejected": -2.8787879943847656, "logps/chosen": -72.49078369140625, "logps/rejected": -77.75, "loss": 0.6626, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17644812166690826, "rewards/margins": 0.06763257086277008, "rewards/rejected": -0.24408069252967834, "step": 5780 }, { "epoch": 3.990351481736733, "grad_norm": 2.116518259048462, "learning_rate": 8.863866607144999e-13, "logits/chosen": -2.8727505207061768, "logits/rejected": -2.839143753051758, "logps/chosen": -72.63795471191406, "logps/rejected": -75.61775207519531, "loss": 0.6608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.17059843242168427, "rewards/margins": 0.07168037444353104, "rewards/rejected": -0.2422788143157959, "step": 5790 }, { "epoch": 3.9972432804962095, "grad_norm": 1.921798586845398, "learning_rate": 7.235848743236683e-14, "logits/chosen": -2.86662220954895, "logits/rejected": -2.8429157733917236, "logps/chosen": -72.2627182006836, "logps/rejected": -76.62482452392578, "loss": 0.6668, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.1750541776418686, "rewards/margins": 0.05863633751869202, "rewards/rejected": -0.2336905300617218, "step": 5800 }, { "epoch": 3.9972432804962095, "eval_logits/chosen": -2.9570438861846924, "eval_logits/rejected": -2.951200246810913, "eval_logps/chosen": -71.36335754394531, "eval_logps/rejected": -78.77714538574219, "eval_loss": 0.6802749633789062, "eval_rewards/accuracies": 0.6036245226860046, "eval_rewards/chosen": -0.12651462852954865, "eval_rewards/margins": 0.02945556864142418, "eval_rewards/rejected": -0.15597018599510193, "eval_runtime": 384.6527, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 5800 }, { "epoch": 4.0, "step": 5804, "total_flos": 0.0, "train_loss": 0.6773310824150385, "train_runtime": 111666.1731, "train_samples_per_second": 3.326, "train_steps_per_second": 0.052 } ], "logging_steps": 10, "max_steps": 5804, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }