diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6395 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00026171159382360636, + "grad_norm": 2.427435874938965, + "learning_rate": 1.3054830287206268e-08, + "logits/chosen": -2.452890634536743, + "logits/rejected": -2.3576245307922363, + "logps/chosen": -290.49053955078125, + "logps/rejected": -374.69940185546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0026171159382360636, + "grad_norm": 2.4065892696380615, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": -2.280916452407837, + "logits/rejected": -2.18080735206604, + "logps/chosen": -279.5721435546875, + "logps/rejected": -245.38124084472656, + "loss": 0.6931, + "rewards/accuracies": 0.4236111044883728, + "rewards/chosen": 0.0002959521661978215, + "rewards/margins": 4.458064722712152e-05, + "rewards/rejected": 0.0002513715880922973, + "step": 10 + }, + { + "epoch": 0.005234231876472127, + "grad_norm": 2.543537139892578, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": -2.286400318145752, + "logits/rejected": -2.1322734355926514, + "logps/chosen": -305.47900390625, + "logps/rejected": -237.6411895751953, + "loss": 0.6926, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0033905524760484695, + "rewards/margins": 0.0010894734878093004, + "rewards/rejected": 0.002301078988239169, + "step": 20 + }, + { + "epoch": 0.007851347814708191, + "grad_norm": 2.317607879638672, + "learning_rate": 3.9164490861618804e-07, + "logits/chosen": -2.2721304893493652, + "logits/rejected": -2.2249627113342285, + "logps/chosen": -251.0873565673828, + "logps/rejected": -251.26864624023438, + "loss": 0.6923, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.012264861725270748, + "rewards/margins": 0.0016630779718980193, + "rewards/rejected": 0.010601785033941269, + "step": 30 + }, + { + "epoch": 0.010468463752944255, + "grad_norm": 1.9544142484664917, + "learning_rate": 5.221932114882506e-07, + "logits/chosen": -2.1681597232818604, + "logits/rejected": -2.1325502395629883, + "logps/chosen": -216.1050262451172, + "logps/rejected": -221.6034698486328, + "loss": 0.6911, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.019591109827160835, + "rewards/margins": 0.00413005193695426, + "rewards/rejected": 0.015461057424545288, + "step": 40 + }, + { + "epoch": 0.01308557969118032, + "grad_norm": 2.0900888442993164, + "learning_rate": 6.527415143603135e-07, + "logits/chosen": -2.2135119438171387, + "logits/rejected": -2.1745445728302, + "logps/chosen": -266.76007080078125, + "logps/rejected": -234.2284698486328, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02980896458029747, + "rewards/margins": 0.005023510195314884, + "rewards/rejected": 0.02478545531630516, + "step": 50 + }, + { + "epoch": 0.015702695629416383, + "grad_norm": 2.1390092372894287, + "learning_rate": 7.832898172323761e-07, + "logits/chosen": -2.1692872047424316, + "logits/rejected": -2.1056342124938965, + "logps/chosen": -252.186767578125, + "logps/rejected": -226.5349884033203, + "loss": 0.6901, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.03267771750688553, + "rewards/margins": 0.0062465183436870575, + "rewards/rejected": 0.026431197300553322, + "step": 60 + }, + { + "epoch": 0.018319811567652448, + "grad_norm": 2.0599091053009033, + "learning_rate": 9.138381201044387e-07, + "logits/chosen": -2.309943675994873, + "logits/rejected": -2.187107563018799, + "logps/chosen": -271.86541748046875, + "logps/rejected": -246.50680541992188, + "loss": 0.6877, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.04242750257253647, + "rewards/margins": 0.011090461164712906, + "rewards/rejected": 0.031337037682533264, + "step": 70 + }, + { + "epoch": 0.02093692750588851, + "grad_norm": 2.3880298137664795, + "learning_rate": 1.0443864229765013e-06, + "logits/chosen": -2.2041609287261963, + "logits/rejected": -2.1138315200805664, + "logps/chosen": -257.4185485839844, + "logps/rejected": -246.7639923095703, + "loss": 0.6874, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03884550929069519, + "rewards/margins": 0.011877561919391155, + "rewards/rejected": 0.02696794643998146, + "step": 80 + }, + { + "epoch": 0.023554043444124574, + "grad_norm": 2.3031389713287354, + "learning_rate": 1.1749347258485642e-06, + "logits/chosen": -2.208482265472412, + "logits/rejected": -2.1343834400177, + "logps/chosen": -249.96255493164062, + "logps/rejected": -234.4242706298828, + "loss": 0.684, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.043103523552417755, + "rewards/margins": 0.018813790753483772, + "rewards/rejected": 0.024289730936288834, + "step": 90 + }, + { + "epoch": 0.02617115938236064, + "grad_norm": 2.119929075241089, + "learning_rate": 1.305483028720627e-06, + "logits/chosen": -2.2504467964172363, + "logits/rejected": -2.178734540939331, + "logps/chosen": -246.7833251953125, + "logps/rejected": -230.8575897216797, + "loss": 0.6807, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04890027642250061, + "rewards/margins": 0.025924110785126686, + "rewards/rejected": 0.022976163774728775, + "step": 100 + }, + { + "epoch": 0.02617115938236064, + "eval_logits/chosen": -2.1481568813323975, + "eval_logits/rejected": -2.055117607116699, + "eval_logps/chosen": -259.46044921875, + "eval_logps/rejected": -242.01309204101562, + "eval_loss": 0.6808694005012512, + "eval_rewards/accuracies": 0.6554999947547913, + "eval_rewards/chosen": 0.05141494795680046, + "eval_rewards/margins": 0.025842413306236267, + "eval_rewards/rejected": 0.025572534650564194, + "eval_runtime": 1599.8543, + "eval_samples_per_second": 1.25, + "eval_steps_per_second": 0.156, + "step": 100 + }, + { + "epoch": 0.028788275320596704, + "grad_norm": 2.4198131561279297, + "learning_rate": 1.4360313315926894e-06, + "logits/chosen": -2.2423720359802246, + "logits/rejected": -2.1254634857177734, + "logps/chosen": -284.2754821777344, + "logps/rejected": -239.1751251220703, + "loss": 0.677, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.050642453134059906, + "rewards/margins": 0.03400001674890518, + "rewards/rejected": 0.016642430797219276, + "step": 110 + }, + { + "epoch": 0.031405391258832765, + "grad_norm": 2.272566556930542, + "learning_rate": 1.5665796344647521e-06, + "logits/chosen": -2.273714303970337, + "logits/rejected": -2.160338878631592, + "logps/chosen": -287.36285400390625, + "logps/rejected": -272.5426025390625, + "loss": 0.6696, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.05583573505282402, + "rewards/margins": 0.04974224418401718, + "rewards/rejected": 0.006093493662774563, + "step": 120 + }, + { + "epoch": 0.03402250719706883, + "grad_norm": 2.827535390853882, + "learning_rate": 1.6971279373368146e-06, + "logits/chosen": -2.2895429134368896, + "logits/rejected": -2.1921463012695312, + "logps/chosen": -250.36807250976562, + "logps/rejected": -254.2834930419922, + "loss": 0.664, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.04658503085374832, + "rewards/margins": 0.06275991350412369, + "rewards/rejected": -0.01617487706243992, + "step": 130 + }, + { + "epoch": 0.036639623135304895, + "grad_norm": 2.8360142707824707, + "learning_rate": 1.8276762402088774e-06, + "logits/chosen": -2.285165309906006, + "logits/rejected": -2.075157880783081, + "logps/chosen": -272.5437927246094, + "logps/rejected": -229.80880737304688, + "loss": 0.662, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.021640608087182045, + "rewards/margins": 0.06897087395191193, + "rewards/rejected": -0.047330256551504135, + "step": 140 + }, + { + "epoch": 0.03925673907354096, + "grad_norm": 3.0254032611846924, + "learning_rate": 1.9582245430809403e-06, + "logits/chosen": -2.289304494857788, + "logits/rejected": -2.16825532913208, + "logps/chosen": -283.7846984863281, + "logps/rejected": -248.1438446044922, + "loss": 0.6606, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.0013313032686710358, + "rewards/margins": 0.07430683076381683, + "rewards/rejected": -0.0729755312204361, + "step": 150 + }, + { + "epoch": 0.04187385501177702, + "grad_norm": 3.2089273929595947, + "learning_rate": 2.0887728459530026e-06, + "logits/chosen": -2.209859609603882, + "logits/rejected": -2.1506431102752686, + "logps/chosen": -262.52569580078125, + "logps/rejected": -270.04766845703125, + "loss": 0.6632, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.05757613852620125, + "rewards/margins": 0.07334659993648529, + "rewards/rejected": -0.13092274963855743, + "step": 160 + }, + { + "epoch": 0.04449097095001309, + "grad_norm": 3.7007648944854736, + "learning_rate": 2.2193211488250653e-06, + "logits/chosen": -2.190873384475708, + "logits/rejected": -2.1105270385742188, + "logps/chosen": -227.3632049560547, + "logps/rejected": -236.8821563720703, + "loss": 0.6635, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0691681057214737, + "rewards/margins": 0.0719941109418869, + "rewards/rejected": -0.1411622166633606, + "step": 170 + }, + { + "epoch": 0.04710808688824915, + "grad_norm": 5.2089338302612305, + "learning_rate": 2.3498694516971284e-06, + "logits/chosen": -2.2085936069488525, + "logits/rejected": -2.1274473667144775, + "logps/chosen": -273.2780456542969, + "logps/rejected": -269.22747802734375, + "loss": 0.6559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17684438824653625, + "rewards/margins": 0.0919983834028244, + "rewards/rejected": -0.26884278655052185, + "step": 180 + }, + { + "epoch": 0.04972520282648522, + "grad_norm": 4.031327724456787, + "learning_rate": 2.4804177545691907e-06, + "logits/chosen": -2.2836358547210693, + "logits/rejected": -2.1660006046295166, + "logps/chosen": -281.2835388183594, + "logps/rejected": -266.60821533203125, + "loss": 0.637, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.11275825649499893, + "rewards/margins": 0.13688938319683075, + "rewards/rejected": -0.24964764714241028, + "step": 190 + }, + { + "epoch": 0.05234231876472128, + "grad_norm": 6.425544261932373, + "learning_rate": 2.610966057441254e-06, + "logits/chosen": -2.2068774700164795, + "logits/rejected": -2.0849859714508057, + "logps/chosen": -266.81500244140625, + "logps/rejected": -241.287353515625, + "loss": 0.6438, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.1718008667230606, + "rewards/margins": 0.12619325518608093, + "rewards/rejected": -0.29799407720565796, + "step": 200 + }, + { + "epoch": 0.05234231876472128, + "eval_logits/chosen": -2.0999979972839355, + "eval_logits/rejected": -2.011294364929199, + "eval_logps/chosen": -283.4154357910156, + "eval_logps/rejected": -278.46148681640625, + "eval_loss": 0.6356053948402405, + "eval_rewards/accuracies": 0.6759999990463257, + "eval_rewards/chosen": -0.18813487887382507, + "eval_rewards/margins": 0.15077635645866394, + "eval_rewards/rejected": -0.3389112055301666, + "eval_runtime": 1598.5625, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.156, + "step": 200 + }, + { + "epoch": 0.05495943470295734, + "grad_norm": 4.150570869445801, + "learning_rate": 2.741514360313316e-06, + "logits/chosen": -2.255913734436035, + "logits/rejected": -2.120842456817627, + "logps/chosen": -280.205810546875, + "logps/rejected": -268.5466613769531, + "loss": 0.6131, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18164019286632538, + "rewards/margins": 0.19993841648101807, + "rewards/rejected": -0.38157862424850464, + "step": 210 + }, + { + "epoch": 0.05757655064119341, + "grad_norm": 4.034811496734619, + "learning_rate": 2.872062663185379e-06, + "logits/chosen": -2.1413655281066895, + "logits/rejected": -2.093209743499756, + "logps/chosen": -298.56549072265625, + "logps/rejected": -289.4757995605469, + "loss": 0.6254, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4465310573577881, + "rewards/margins": 0.17338070273399353, + "rewards/rejected": -0.619911789894104, + "step": 220 + }, + { + "epoch": 0.06019366657942947, + "grad_norm": 6.063634395599365, + "learning_rate": 3.0026109660574416e-06, + "logits/chosen": -2.266742706298828, + "logits/rejected": -2.164170742034912, + "logps/chosen": -352.3594665527344, + "logps/rejected": -333.99053955078125, + "loss": 0.6479, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.46258726716041565, + "rewards/margins": 0.17155149579048157, + "rewards/rejected": -0.6341387033462524, + "step": 230 + }, + { + "epoch": 0.06281078251766553, + "grad_norm": 5.352989673614502, + "learning_rate": 3.1331592689295043e-06, + "logits/chosen": -2.136970281600952, + "logits/rejected": -2.0645029544830322, + "logps/chosen": -342.98870849609375, + "logps/rejected": -350.2208251953125, + "loss": 0.6237, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.4678193926811218, + "rewards/margins": 0.2228115350008011, + "rewards/rejected": -0.6906309127807617, + "step": 240 + }, + { + "epoch": 0.06542789845590159, + "grad_norm": 6.087672233581543, + "learning_rate": 3.263707571801567e-06, + "logits/chosen": -2.162543296813965, + "logits/rejected": -2.1394383907318115, + "logps/chosen": -298.310302734375, + "logps/rejected": -299.23260498046875, + "loss": 0.6014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3882629871368408, + "rewards/margins": 0.26374003291130066, + "rewards/rejected": -0.6520029902458191, + "step": 250 + }, + { + "epoch": 0.06804501439413765, + "grad_norm": 9.075284004211426, + "learning_rate": 3.3942558746736293e-06, + "logits/chosen": -2.2069649696350098, + "logits/rejected": -2.0832362174987793, + "logps/chosen": -315.36358642578125, + "logps/rejected": -315.8193664550781, + "loss": 0.6231, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.43095770478248596, + "rewards/margins": 0.2242399901151657, + "rewards/rejected": -0.6551976203918457, + "step": 260 + }, + { + "epoch": 0.07066213033237373, + "grad_norm": 4.0352067947387695, + "learning_rate": 3.524804177545692e-06, + "logits/chosen": -2.1507174968719482, + "logits/rejected": -2.084745407104492, + "logps/chosen": -323.89361572265625, + "logps/rejected": -321.73907470703125, + "loss": 0.5962, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6326006650924683, + "rewards/margins": 0.293195515871048, + "rewards/rejected": -0.9257962107658386, + "step": 270 + }, + { + "epoch": 0.07327924627060979, + "grad_norm": 6.033257007598877, + "learning_rate": 3.6553524804177547e-06, + "logits/chosen": -2.17421293258667, + "logits/rejected": -2.054452896118164, + "logps/chosen": -340.52557373046875, + "logps/rejected": -340.84259033203125, + "loss": 0.627, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8170977830886841, + "rewards/margins": 0.2173783779144287, + "rewards/rejected": -1.0344761610031128, + "step": 280 + }, + { + "epoch": 0.07589636220884585, + "grad_norm": 6.212845325469971, + "learning_rate": 3.7859007832898174e-06, + "logits/chosen": -2.1532936096191406, + "logits/rejected": -2.0849764347076416, + "logps/chosen": -353.55975341796875, + "logps/rejected": -354.45782470703125, + "loss": 0.6139, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6797887086868286, + "rewards/margins": 0.26041343808174133, + "rewards/rejected": -0.9402019381523132, + "step": 290 + }, + { + "epoch": 0.07851347814708191, + "grad_norm": 9.043365478515625, + "learning_rate": 3.9164490861618806e-06, + "logits/chosen": -2.1526737213134766, + "logits/rejected": -2.0208210945129395, + "logps/chosen": -306.4209899902344, + "logps/rejected": -320.6431884765625, + "loss": 0.6073, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6231580972671509, + "rewards/margins": 0.2800864577293396, + "rewards/rejected": -0.9032446146011353, + "step": 300 + }, + { + "epoch": 0.07851347814708191, + "eval_logits/chosen": -2.0781641006469727, + "eval_logits/rejected": -1.9948630332946777, + "eval_logps/chosen": -333.25830078125, + "eval_logps/rejected": -342.0091247558594, + "eval_loss": 0.6053693890571594, + "eval_rewards/accuracies": 0.6815000176429749, + "eval_rewards/chosen": -0.6865635514259338, + "eval_rewards/margins": 0.2878238558769226, + "eval_rewards/rejected": -0.9743873476982117, + "eval_runtime": 1598.3515, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.156, + "step": 300 + }, + { + "epoch": 0.08113059408531798, + "grad_norm": 8.154093742370605, + "learning_rate": 4.046997389033943e-06, + "logits/chosen": -2.2768380641937256, + "logits/rejected": -2.16288161277771, + "logps/chosen": -355.626953125, + "logps/rejected": -341.3152770996094, + "loss": 0.5637, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6677303910255432, + "rewards/margins": 0.39315730333328247, + "rewards/rejected": -1.0608876943588257, + "step": 310 + }, + { + "epoch": 0.08374771002355404, + "grad_norm": 12.870465278625488, + "learning_rate": 4.177545691906005e-06, + "logits/chosen": -2.2009758949279785, + "logits/rejected": -2.101364850997925, + "logps/chosen": -330.45904541015625, + "logps/rejected": -343.94305419921875, + "loss": 0.6039, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7225368022918701, + "rewards/margins": 0.30548617243766785, + "rewards/rejected": -1.0280230045318604, + "step": 320 + }, + { + "epoch": 0.08636482596179011, + "grad_norm": 5.590673923492432, + "learning_rate": 4.308093994778068e-06, + "logits/chosen": -2.086026668548584, + "logits/rejected": -2.0480690002441406, + "logps/chosen": -313.5097351074219, + "logps/rejected": -321.5640869140625, + "loss": 0.5895, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5396815538406372, + "rewards/margins": 0.322670042514801, + "rewards/rejected": -0.8623515963554382, + "step": 330 + }, + { + "epoch": 0.08898194190002617, + "grad_norm": 7.664637088775635, + "learning_rate": 4.4386422976501306e-06, + "logits/chosen": -2.1061110496520996, + "logits/rejected": -2.049543857574463, + "logps/chosen": -358.71868896484375, + "logps/rejected": -383.11328125, + "loss": 0.5694, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6914780735969543, + "rewards/margins": 0.45771294832229614, + "rewards/rejected": -1.1491910219192505, + "step": 340 + }, + { + "epoch": 0.09159905783826224, + "grad_norm": 10.34107780456543, + "learning_rate": 4.569190600522193e-06, + "logits/chosen": -1.9783916473388672, + "logits/rejected": -1.901391625404358, + "logps/chosen": -401.55120849609375, + "logps/rejected": -426.84832763671875, + "loss": 0.6179, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1243749856948853, + "rewards/margins": 0.3678717613220215, + "rewards/rejected": -1.4922468662261963, + "step": 350 + }, + { + "epoch": 0.0942161737764983, + "grad_norm": 6.533565044403076, + "learning_rate": 4.699738903394257e-06, + "logits/chosen": -1.9793760776519775, + "logits/rejected": -1.922286033630371, + "logps/chosen": -367.68817138671875, + "logps/rejected": -385.93798828125, + "loss": 0.5861, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2674810886383057, + "rewards/margins": 0.39134687185287476, + "rewards/rejected": -1.6588280200958252, + "step": 360 + }, + { + "epoch": 0.09683328971473436, + "grad_norm": 9.993318557739258, + "learning_rate": 4.8302872062663196e-06, + "logits/chosen": -1.9993594884872437, + "logits/rejected": -1.8775148391723633, + "logps/chosen": -397.10125732421875, + "logps/rejected": -394.6053771972656, + "loss": 0.5852, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2026114463806152, + "rewards/margins": 0.4132401943206787, + "rewards/rejected": -1.615851640701294, + "step": 370 + }, + { + "epoch": 0.09945040565297043, + "grad_norm": 8.581938743591309, + "learning_rate": 4.9608355091383814e-06, + "logits/chosen": -1.9525811672210693, + "logits/rejected": -1.7932708263397217, + "logps/chosen": -388.2157287597656, + "logps/rejected": -399.0534973144531, + "loss": 0.5682, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9025002717971802, + "rewards/margins": 0.44248518347740173, + "rewards/rejected": -1.3449854850769043, + "step": 380 + }, + { + "epoch": 0.1020675215912065, + "grad_norm": 17.113487243652344, + "learning_rate": 4.9999488562447675e-06, + "logits/chosen": -1.920275330543518, + "logits/rejected": -1.8368419408798218, + "logps/chosen": -342.7761535644531, + "logps/rejected": -365.7699279785156, + "loss": 0.5702, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6181488633155823, + "rewards/margins": 0.4308691620826721, + "rewards/rejected": -1.0490180253982544, + "step": 390 + }, + { + "epoch": 0.10468463752944256, + "grad_norm": 18.049278259277344, + "learning_rate": 4.999698361256577e-06, + "logits/chosen": -1.844530463218689, + "logits/rejected": -1.724854826927185, + "logps/chosen": -372.54058837890625, + "logps/rejected": -370.39080810546875, + "loss": 0.5956, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0718820095062256, + "rewards/margins": 0.3849504590034485, + "rewards/rejected": -1.4568325281143188, + "step": 400 + }, + { + "epoch": 0.10468463752944256, + "eval_logits/chosen": -1.6757981777191162, + "eval_logits/rejected": -1.5843830108642578, + "eval_logps/chosen": -409.4522399902344, + "eval_logps/rejected": -440.5653381347656, + "eval_loss": 0.5824012160301208, + "eval_rewards/accuracies": 0.6830000281333923, + "eval_rewards/chosen": -1.448502540588379, + "eval_rewards/margins": 0.5114473700523376, + "eval_rewards/rejected": -1.9599499702453613, + "eval_runtime": 1596.5137, + "eval_samples_per_second": 1.253, + "eval_steps_per_second": 0.157, + "step": 400 + }, + { + "epoch": 0.10730175346767862, + "grad_norm": 7.473143577575684, + "learning_rate": 4.999239142174581e-06, + "logits/chosen": -1.7772619724273682, + "logits/rejected": -1.71932053565979, + "logps/chosen": -362.2901306152344, + "logps/rejected": -392.97271728515625, + "loss": 0.6257, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2004709243774414, + "rewards/margins": 0.3244817852973938, + "rewards/rejected": -1.5249526500701904, + "step": 410 + }, + { + "epoch": 0.10991886940591468, + "grad_norm": 5.672206401824951, + "learning_rate": 4.99857123734344e-06, + "logits/chosen": -1.798180341720581, + "logits/rejected": -1.6758928298950195, + "logps/chosen": -342.5751037597656, + "logps/rejected": -385.69757080078125, + "loss": 0.5303, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1391090154647827, + "rewards/margins": 0.5212605595588684, + "rewards/rejected": -1.660369634628296, + "step": 420 + }, + { + "epoch": 0.11253598534415074, + "grad_norm": 7.920849800109863, + "learning_rate": 4.997694702533016e-06, + "logits/chosen": -1.839582085609436, + "logits/rejected": -1.7662830352783203, + "logps/chosen": -396.0287170410156, + "logps/rejected": -421.93707275390625, + "loss": 0.542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1852526664733887, + "rewards/margins": 0.49562257528305054, + "rewards/rejected": -1.6808754205703735, + "step": 430 + }, + { + "epoch": 0.11515310128238682, + "grad_norm": 9.021227836608887, + "learning_rate": 4.996609610933713e-06, + "logits/chosen": -1.9095804691314697, + "logits/rejected": -1.8493105173110962, + "logps/chosen": -372.8834533691406, + "logps/rejected": -393.52532958984375, + "loss": 0.575, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0081276893615723, + "rewards/margins": 0.4901418089866638, + "rewards/rejected": -1.4982694387435913, + "step": 440 + }, + { + "epoch": 0.11777021722062288, + "grad_norm": 8.462127685546875, + "learning_rate": 4.995316053150366e-06, + "logits/chosen": -1.7473382949829102, + "logits/rejected": -1.6900889873504639, + "logps/chosen": -388.23175048828125, + "logps/rejected": -426.498046875, + "loss": 0.5359, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2048237323760986, + "rewards/margins": 0.612975537776947, + "rewards/rejected": -1.8177993297576904, + "step": 450 + }, + { + "epoch": 0.12038733315885894, + "grad_norm": 12.204924583435059, + "learning_rate": 4.9938141371946815e-06, + "logits/chosen": -1.7278436422348022, + "logits/rejected": -1.6576976776123047, + "logps/chosen": -472.71600341796875, + "logps/rejected": -530.8753662109375, + "loss": 0.5586, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0762267112731934, + "rewards/margins": 0.7278280258178711, + "rewards/rejected": -2.8040547370910645, + "step": 460 + }, + { + "epoch": 0.123004449097095, + "grad_norm": 6.473967552185059, + "learning_rate": 4.992103988476206e-06, + "logits/chosen": -1.8171007633209229, + "logits/rejected": -1.712386131286621, + "logps/chosen": -393.3753662109375, + "logps/rejected": -441.9925231933594, + "loss": 0.5643, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4961185455322266, + "rewards/margins": 0.6115677356719971, + "rewards/rejected": -2.1076862812042236, + "step": 470 + }, + { + "epoch": 0.12562156503533106, + "grad_norm": 5.5269598960876465, + "learning_rate": 4.990185749791866e-06, + "logits/chosen": -1.9438987970352173, + "logits/rejected": -1.8433564901351929, + "logps/chosen": -341.74261474609375, + "logps/rejected": -403.5177917480469, + "loss": 0.5323, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8496583104133606, + "rewards/margins": 0.5994860529899597, + "rewards/rejected": -1.4491443634033203, + "step": 480 + }, + { + "epoch": 0.12823868097356714, + "grad_norm": 9.541817665100098, + "learning_rate": 4.9880595813140395e-06, + "logits/chosen": -1.9532935619354248, + "logits/rejected": -1.8457939624786377, + "logps/chosen": -374.6437072753906, + "logps/rejected": -403.85614013671875, + "loss": 0.5377, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8984916806221008, + "rewards/margins": 0.5953295826911926, + "rewards/rejected": -1.493821144104004, + "step": 490 + }, + { + "epoch": 0.13085579691180318, + "grad_norm": 13.012099266052246, + "learning_rate": 4.985725660577184e-06, + "logits/chosen": -1.8454961776733398, + "logits/rejected": -1.7134668827056885, + "logps/chosen": -394.9266052246094, + "logps/rejected": -415.48138427734375, + "loss": 0.5643, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2125742435455322, + "rewards/margins": 0.6179044842720032, + "rewards/rejected": -1.8304786682128906, + "step": 500 + }, + { + "epoch": 0.13085579691180318, + "eval_logits/chosen": -1.6658297777175903, + "eval_logits/rejected": -1.56244695186615, + "eval_logps/chosen": -379.18035888671875, + "eval_logps/rejected": -420.46356201171875, + "eval_loss": 0.5725830793380737, + "eval_rewards/accuracies": 0.6915000081062317, + "eval_rewards/chosen": -1.1457839012145996, + "eval_rewards/margins": 0.6131481528282166, + "eval_rewards/rejected": -1.758932113647461, + "eval_runtime": 1597.6305, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.156, + "step": 500 + }, + { + "epoch": 0.13347291285003926, + "grad_norm": 9.327789306640625, + "learning_rate": 4.983184182463009e-06, + "logits/chosen": -1.7511212825775146, + "logits/rejected": -1.6526283025741577, + "logps/chosen": -403.2107238769531, + "logps/rejected": -449.2831115722656, + "loss": 0.5133, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2511955499649048, + "rewards/margins": 0.830196738243103, + "rewards/rejected": -2.0813920497894287, + "step": 510 + }, + { + "epoch": 0.1360900287882753, + "grad_norm": 20.807098388671875, + "learning_rate": 4.980435359184203e-06, + "logits/chosen": -1.7725025415420532, + "logits/rejected": -1.7354393005371094, + "logps/chosen": -416.3721618652344, + "logps/rejected": -457.4166564941406, + "loss": 0.5914, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4335078001022339, + "rewards/margins": 0.5734153985977173, + "rewards/rejected": -2.006923198699951, + "step": 520 + }, + { + "epoch": 0.13870714472651138, + "grad_norm": 9.319575309753418, + "learning_rate": 4.9774794202667236e-06, + "logits/chosen": -1.8060506582260132, + "logits/rejected": -1.7954254150390625, + "logps/chosen": -347.8722229003906, + "logps/rejected": -408.69842529296875, + "loss": 0.568, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8692201375961304, + "rewards/margins": 0.5232836008071899, + "rewards/rejected": -1.3925037384033203, + "step": 530 + }, + { + "epoch": 0.14132426066474746, + "grad_norm": 18.703466415405273, + "learning_rate": 4.974316612530615e-06, + "logits/chosen": -1.718746542930603, + "logits/rejected": -1.6051177978515625, + "logps/chosen": -397.3692932128906, + "logps/rejected": -437.828125, + "loss": 0.4615, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.1902121305465698, + "rewards/margins": 0.8715072870254517, + "rewards/rejected": -2.0617194175720215, + "step": 540 + }, + { + "epoch": 0.1439413766029835, + "grad_norm": 13.324700355529785, + "learning_rate": 4.970947200069416e-06, + "logits/chosen": -1.5196049213409424, + "logits/rejected": -1.463122010231018, + "logps/chosen": -523.3265380859375, + "logps/rejected": -561.0288696289062, + "loss": 0.6068, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.449286937713623, + "rewards/margins": 0.5750513076782227, + "rewards/rejected": -3.0243382453918457, + "step": 550 + }, + { + "epoch": 0.14655849254121958, + "grad_norm": 7.8647990226745605, + "learning_rate": 4.967371464228096e-06, + "logits/chosen": -1.6284644603729248, + "logits/rejected": -1.54505455493927, + "logps/chosen": -521.0708618164062, + "logps/rejected": -584.5469970703125, + "loss": 0.5398, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.636112689971924, + "rewards/margins": 0.6315088868141174, + "rewards/rejected": -3.2676215171813965, + "step": 560 + }, + { + "epoch": 0.14917560847945563, + "grad_norm": 8.891359329223633, + "learning_rate": 4.963589703579569e-06, + "logits/chosen": -1.7360155582427979, + "logits/rejected": -1.6161645650863647, + "logps/chosen": -575.2686767578125, + "logps/rejected": -591.63671875, + "loss": 0.6015, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.752664566040039, + "rewards/margins": 0.5114163160324097, + "rewards/rejected": -3.264080762863159, + "step": 570 + }, + { + "epoch": 0.1517927244176917, + "grad_norm": 13.137563705444336, + "learning_rate": 4.9596022338997615e-06, + "logits/chosen": -1.8721704483032227, + "logits/rejected": -1.7023900747299194, + "logps/chosen": -481.35992431640625, + "logps/rejected": -496.80792236328125, + "loss": 0.5479, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8363927602767944, + "rewards/margins": 0.565157949924469, + "rewards/rejected": -2.401550769805908, + "step": 580 + }, + { + "epoch": 0.15440984035592778, + "grad_norm": 9.32674789428711, + "learning_rate": 4.955409388141243e-06, + "logits/chosen": -1.9033949375152588, + "logits/rejected": -1.81440007686615, + "logps/chosen": -351.5168762207031, + "logps/rejected": -374.45159912109375, + "loss": 0.5774, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9435514211654663, + "rewards/margins": 0.46548739075660706, + "rewards/rejected": -1.4090386629104614, + "step": 590 + }, + { + "epoch": 0.15702695629416383, + "grad_norm": 8.5282621383667, + "learning_rate": 4.951011516405429e-06, + "logits/chosen": -1.9967750310897827, + "logits/rejected": -1.959011435508728, + "logps/chosen": -324.1109924316406, + "logps/rejected": -367.7194519042969, + "loss": 0.5373, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7343847155570984, + "rewards/margins": 0.5855700373649597, + "rewards/rejected": -1.319954752922058, + "step": 600 + }, + { + "epoch": 0.15702695629416383, + "eval_logits/chosen": -1.7954951524734497, + "eval_logits/rejected": -1.6945267915725708, + "eval_logps/chosen": -377.46051025390625, + "eval_logps/rejected": -426.21209716796875, + "eval_loss": 0.5631101727485657, + "eval_rewards/accuracies": 0.703000009059906, + "eval_rewards/chosen": -1.1285854578018188, + "eval_rewards/margins": 0.6878318190574646, + "eval_rewards/rejected": -1.8164173364639282, + "eval_runtime": 1597.4817, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.156, + "step": 600 + }, + { + "epoch": 0.1596440722323999, + "grad_norm": 8.468684196472168, + "learning_rate": 4.946408985913344e-06, + "logits/chosen": -1.8055137395858765, + "logits/rejected": -1.7273778915405273, + "logps/chosen": -379.7837829589844, + "logps/rejected": -452.4205627441406, + "loss": 0.5374, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3405410051345825, + "rewards/margins": 0.9033535122871399, + "rewards/rejected": -2.243894577026367, + "step": 610 + }, + { + "epoch": 0.16226118817063595, + "grad_norm": 8.468870162963867, + "learning_rate": 4.941602180974958e-06, + "logits/chosen": -1.712264060974121, + "logits/rejected": -1.5265555381774902, + "logps/chosen": -442.0708923339844, + "logps/rejected": -476.46112060546875, + "loss": 0.5459, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5436513423919678, + "rewards/margins": 0.9296489953994751, + "rewards/rejected": -2.4733004570007324, + "step": 620 + }, + { + "epoch": 0.16487830410887203, + "grad_norm": 8.517386436462402, + "learning_rate": 4.936591502957101e-06, + "logits/chosen": -1.650029182434082, + "logits/rejected": -1.5338895320892334, + "logps/chosen": -383.0647888183594, + "logps/rejected": -462.663330078125, + "loss": 0.5098, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3505860567092896, + "rewards/margins": 0.8982070684432983, + "rewards/rejected": -2.248793125152588, + "step": 630 + }, + { + "epoch": 0.16749542004710807, + "grad_norm": 7.095081329345703, + "learning_rate": 4.931377370249946e-06, + "logits/chosen": -1.6547809839248657, + "logits/rejected": -1.4826760292053223, + "logps/chosen": -434.3949279785156, + "logps/rejected": -483.05657958984375, + "loss": 0.5251, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.691422462463379, + "rewards/margins": 0.7445409297943115, + "rewards/rejected": -2.4359633922576904, + "step": 640 + }, + { + "epoch": 0.17011253598534415, + "grad_norm": 16.53241539001465, + "learning_rate": 4.925960218232073e-06, + "logits/chosen": -1.586260199546814, + "logits/rejected": -1.4878358840942383, + "logps/chosen": -451.49871826171875, + "logps/rejected": -535.3369750976562, + "loss": 0.5454, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9628738164901733, + "rewards/margins": 0.9344732165336609, + "rewards/rejected": -2.8973469734191895, + "step": 650 + }, + { + "epoch": 0.17272965192358022, + "grad_norm": 12.95315170288086, + "learning_rate": 4.920340499234116e-06, + "logits/chosen": -1.5459370613098145, + "logits/rejected": -1.3826103210449219, + "logps/chosen": -457.12640380859375, + "logps/rejected": -501.2940368652344, + "loss": 0.5561, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.9127668142318726, + "rewards/margins": 0.7934447526931763, + "rewards/rejected": -2.706211566925049, + "step": 660 + }, + { + "epoch": 0.17534676786181627, + "grad_norm": 13.084680557250977, + "learning_rate": 4.914518682500995e-06, + "logits/chosen": -1.6595103740692139, + "logits/rejected": -1.5149381160736084, + "logps/chosen": -505.25360107421875, + "logps/rejected": -557.6201782226562, + "loss": 0.5516, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.2182066440582275, + "rewards/margins": 0.930493950843811, + "rewards/rejected": -3.1487009525299072, + "step": 670 + }, + { + "epoch": 0.17796388380005235, + "grad_norm": 9.387535095214844, + "learning_rate": 4.9084952541527315e-06, + "logits/chosen": -1.5597246885299683, + "logits/rejected": -1.4202911853790283, + "logps/chosen": -468.02777099609375, + "logps/rejected": -508.26483154296875, + "loss": 0.5279, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9475713968276978, + "rewards/margins": 0.8734270334243774, + "rewards/rejected": -2.820998430252075, + "step": 680 + }, + { + "epoch": 0.1805809997382884, + "grad_norm": 10.536702156066895, + "learning_rate": 4.902270717143858e-06, + "logits/chosen": -1.6048400402069092, + "logits/rejected": -1.5331923961639404, + "logps/chosen": -395.47686767578125, + "logps/rejected": -517.7385864257812, + "loss": 0.4506, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5932337045669556, + "rewards/margins": 1.1062676906585693, + "rewards/rejected": -2.6995015144348145, + "step": 690 + }, + { + "epoch": 0.18319811567652447, + "grad_norm": 6.061295509338379, + "learning_rate": 4.895845591221427e-06, + "logits/chosen": -1.5165598392486572, + "logits/rejected": -1.4684141874313354, + "logps/chosen": -450.1396484375, + "logps/rejected": -525.9497680664062, + "loss": 0.5394, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9692981243133545, + "rewards/margins": 0.802649199962616, + "rewards/rejected": -2.7719473838806152, + "step": 700 + }, + { + "epoch": 0.18319811567652447, + "eval_logits/chosen": -1.271895408630371, + "eval_logits/rejected": -1.1628035306930542, + "eval_logps/chosen": -491.6012268066406, + "eval_logps/rejected": -551.1991577148438, + "eval_loss": 0.5473812222480774, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": -2.2699923515319824, + "eval_rewards/margins": 0.7962960004806519, + "eval_rewards/rejected": -3.066288471221924, + "eval_runtime": 1597.7171, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.156, + "step": 700 + }, + { + "epoch": 0.18581523161476055, + "grad_norm": 13.924752235412598, + "learning_rate": 4.8892204128816e-06, + "logits/chosen": -1.3581098318099976, + "logits/rejected": -1.2599608898162842, + "logps/chosen": -515.8197631835938, + "logps/rejected": -587.2666625976562, + "loss": 0.5115, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4833767414093018, + "rewards/margins": 0.8504128456115723, + "rewards/rejected": -3.333789348602295, + "step": 710 + }, + { + "epoch": 0.1884323475529966, + "grad_norm": 9.558119773864746, + "learning_rate": 4.882395735324864e-06, + "logits/chosen": -1.140836477279663, + "logits/rejected": -0.9971574544906616, + "logps/chosen": -565.4805908203125, + "logps/rejected": -648.9151611328125, + "loss": 0.4868, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.9943554401397705, + "rewards/margins": 1.006219744682312, + "rewards/rejected": -4.000575065612793, + "step": 720 + }, + { + "epoch": 0.19104946349123267, + "grad_norm": 16.967636108398438, + "learning_rate": 4.87537212840983e-06, + "logits/chosen": -1.0284086465835571, + "logits/rejected": -0.9112738370895386, + "logps/chosen": -624.4201049804688, + "logps/rejected": -673.4655151367188, + "loss": 0.6183, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.582063674926758, + "rewards/margins": 0.7967410087585449, + "rewards/rejected": -4.378804683685303, + "step": 730 + }, + { + "epoch": 0.19366657942946872, + "grad_norm": 17.04477310180664, + "learning_rate": 4.8681501786056545e-06, + "logits/chosen": -1.1650705337524414, + "logits/rejected": -1.0339478254318237, + "logps/chosen": -438.68585205078125, + "logps/rejected": -490.553955078125, + "loss": 0.497, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1054441928863525, + "rewards/margins": 0.8412445187568665, + "rewards/rejected": -2.946688652038574, + "step": 740 + }, + { + "epoch": 0.1962836953677048, + "grad_norm": 20.108728408813477, + "learning_rate": 4.860730488943068e-06, + "logits/chosen": -1.2052314281463623, + "logits/rejected": -1.157947301864624, + "logps/chosen": -439.97589111328125, + "logps/rejected": -518.9736328125, + "loss": 0.5097, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.0552916526794434, + "rewards/margins": 0.809428870677948, + "rewards/rejected": -2.864720106124878, + "step": 750 + }, + { + "epoch": 0.19890081130594087, + "grad_norm": 6.034134387969971, + "learning_rate": 4.853113678964022e-06, + "logits/chosen": -1.3641645908355713, + "logits/rejected": -1.3035409450531006, + "logps/chosen": -421.71343994140625, + "logps/rejected": -494.55810546875, + "loss": 0.5004, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4936286211013794, + "rewards/margins": 0.8079965710639954, + "rewards/rejected": -2.3016250133514404, + "step": 760 + }, + { + "epoch": 0.20151792724417691, + "grad_norm": 5.774670124053955, + "learning_rate": 4.845300384669958e-06, + "logits/chosen": -1.4722392559051514, + "logits/rejected": -1.3768599033355713, + "logps/chosen": -374.771484375, + "logps/rejected": -418.16656494140625, + "loss": 0.5331, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2165186405181885, + "rewards/margins": 0.6629087328910828, + "rewards/rejected": -1.8794273138046265, + "step": 770 + }, + { + "epoch": 0.204135043182413, + "grad_norm": 8.666418075561523, + "learning_rate": 4.837291258468701e-06, + "logits/chosen": -1.63046395778656, + "logits/rejected": -1.5205990076065063, + "logps/chosen": -421.2227478027344, + "logps/rejected": -473.2460021972656, + "loss": 0.5528, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3603990077972412, + "rewards/margins": 0.7691534161567688, + "rewards/rejected": -2.1295523643493652, + "step": 780 + }, + { + "epoch": 0.20675215912064904, + "grad_norm": 15.76352310180664, + "learning_rate": 4.829086969119984e-06, + "logits/chosen": -1.4890462160110474, + "logits/rejected": -1.4975069761276245, + "logps/chosen": -429.3702087402344, + "logps/rejected": -498.40496826171875, + "loss": 0.5898, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7876300811767578, + "rewards/margins": 0.6727088093757629, + "rewards/rejected": -2.460339069366455, + "step": 790 + }, + { + "epoch": 0.2093692750588851, + "grad_norm": 14.872965812683105, + "learning_rate": 4.820688201679605e-06, + "logits/chosen": -1.6982934474945068, + "logits/rejected": -1.4959887266159058, + "logps/chosen": -433.38470458984375, + "logps/rejected": -451.4923400878906, + "loss": 0.4983, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.708482027053833, + "rewards/margins": 0.8051029443740845, + "rewards/rejected": -2.513584852218628, + "step": 800 + }, + { + "epoch": 0.2093692750588851, + "eval_logits/chosen": -1.599565863609314, + "eval_logits/rejected": -1.510375738143921, + "eval_logps/chosen": -420.76544189453125, + "eval_logps/rejected": -474.2269287109375, + "eval_loss": 0.5322815179824829, + "eval_rewards/accuracies": 0.7225000262260437, + "eval_rewards/chosen": -1.5616350173950195, + "eval_rewards/margins": 0.7349306344985962, + "eval_rewards/rejected": -2.2965660095214844, + "eval_runtime": 1595.9174, + "eval_samples_per_second": 1.253, + "eval_steps_per_second": 0.157, + "step": 800 + }, + { + "epoch": 0.21198639099712116, + "grad_norm": 6.158263683319092, + "learning_rate": 4.8120956574422315e-06, + "logits/chosen": -1.8132537603378296, + "logits/rejected": -1.8140255212783813, + "logps/chosen": -400.74359130859375, + "logps/rejected": -452.96392822265625, + "loss": 0.58, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2691619396209717, + "rewards/margins": 0.5926799178123474, + "rewards/rejected": -1.8618419170379639, + "step": 810 + }, + { + "epoch": 0.21460350693535724, + "grad_norm": 14.991472244262695, + "learning_rate": 4.803310053882831e-06, + "logits/chosen": -1.883763313293457, + "logits/rejected": -1.9024746417999268, + "logps/chosen": -327.5868225097656, + "logps/rejected": -404.1509704589844, + "loss": 0.5258, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9834963083267212, + "rewards/margins": 0.6419707536697388, + "rewards/rejected": -1.6254669427871704, + "step": 820 + }, + { + "epoch": 0.2172206228735933, + "grad_norm": 10.492687225341797, + "learning_rate": 4.794332124596775e-06, + "logits/chosen": -1.8571357727050781, + "logits/rejected": -1.813534140586853, + "logps/chosen": -396.38372802734375, + "logps/rejected": -451.81634521484375, + "loss": 0.5868, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2799537181854248, + "rewards/margins": 0.5913030505180359, + "rewards/rejected": -1.8712568283081055, + "step": 830 + }, + { + "epoch": 0.21983773881182936, + "grad_norm": 10.114534378051758, + "learning_rate": 4.785162619238575e-06, + "logits/chosen": -1.8008487224578857, + "logits/rejected": -1.698720932006836, + "logps/chosen": -378.2573547363281, + "logps/rejected": -424.20782470703125, + "loss": 0.5402, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.244909405708313, + "rewards/margins": 0.7323002815246582, + "rewards/rejected": -1.9772096872329712, + "step": 840 + }, + { + "epoch": 0.22245485475006543, + "grad_norm": 7.237858295440674, + "learning_rate": 4.775802303459288e-06, + "logits/chosen": -1.6909987926483154, + "logits/rejected": -1.64451003074646, + "logps/chosen": -384.024169921875, + "logps/rejected": -455.19024658203125, + "loss": 0.5251, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3631868362426758, + "rewards/margins": 0.7588311433792114, + "rewards/rejected": -2.1220178604125977, + "step": 850 + }, + { + "epoch": 0.22507197068830148, + "grad_norm": 20.08100700378418, + "learning_rate": 4.766251958842589e-06, + "logits/chosen": -1.5939120054244995, + "logits/rejected": -1.5511482954025269, + "logps/chosen": -447.6766662597656, + "logps/rejected": -501.3028259277344, + "loss": 0.569, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7521028518676758, + "rewards/margins": 0.6410677433013916, + "rewards/rejected": -2.3931705951690674, + "step": 860 + }, + { + "epoch": 0.22768908662653756, + "grad_norm": 8.703922271728516, + "learning_rate": 4.7565123828395066e-06, + "logits/chosen": -1.5731687545776367, + "logits/rejected": -1.5117802619934082, + "logps/chosen": -424.31463623046875, + "logps/rejected": -501.74725341796875, + "loss": 0.5338, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6849206686019897, + "rewards/margins": 0.7798541784286499, + "rewards/rejected": -2.4647748470306396, + "step": 870 + }, + { + "epoch": 0.23030620256477363, + "grad_norm": 7.03041410446167, + "learning_rate": 4.746584388701831e-06, + "logits/chosen": -1.6322410106658936, + "logits/rejected": -1.613351583480835, + "logps/chosen": -416.93463134765625, + "logps/rejected": -476.6209411621094, + "loss": 0.5234, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5381996631622314, + "rewards/margins": 0.7384254932403564, + "rewards/rejected": -2.276625156402588, + "step": 880 + }, + { + "epoch": 0.23292331850300968, + "grad_norm": 7.439998149871826, + "learning_rate": 4.736468805414218e-06, + "logits/chosen": -1.6738321781158447, + "logits/rejected": -1.6679372787475586, + "logps/chosen": -351.4508056640625, + "logps/rejected": -435.82196044921875, + "loss": 0.546, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0406602621078491, + "rewards/margins": 0.7058707475662231, + "rewards/rejected": -1.7465311288833618, + "step": 890 + }, + { + "epoch": 0.23554043444124576, + "grad_norm": 14.334321975708008, + "learning_rate": 4.7261664776249595e-06, + "logits/chosen": -1.5745285749435425, + "logits/rejected": -1.511311650276184, + "logps/chosen": -331.2271423339844, + "logps/rejected": -421.39752197265625, + "loss": 0.4763, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.024084210395813, + "rewards/margins": 0.9915010333061218, + "rewards/rejected": -2.015585422515869, + "step": 900 + }, + { + "epoch": 0.23554043444124576, + "eval_logits/chosen": -1.4989054203033447, + "eval_logits/rejected": -1.415571928024292, + "eval_logps/chosen": -425.90301513671875, + "eval_logps/rejected": -485.7890319824219, + "eval_loss": 0.5385720133781433, + "eval_rewards/accuracies": 0.7160000205039978, + "eval_rewards/chosen": -1.6130101680755615, + "eval_rewards/margins": 0.7991763353347778, + "eval_rewards/rejected": -2.412186622619629, + "eval_runtime": 1597.0973, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 900 + }, + { + "epoch": 0.2381575503794818, + "grad_norm": 12.946525573730469, + "learning_rate": 4.715678265575463e-06, + "logits/chosen": -1.5601496696472168, + "logits/rejected": -1.4170053005218506, + "logps/chosen": -460.2450256347656, + "logps/rejected": -471.73272705078125, + "loss": 0.5673, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.759927749633789, + "rewards/margins": 0.6714185476303101, + "rewards/rejected": -2.4313464164733887, + "step": 910 + }, + { + "epoch": 0.24077466631771788, + "grad_norm": 7.0246663093566895, + "learning_rate": 4.705005045028415e-06, + "logits/chosen": -1.5280827283859253, + "logits/rejected": -1.4348738193511963, + "logps/chosen": -420.05682373046875, + "logps/rejected": -474.1004333496094, + "loss": 0.5575, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5702658891677856, + "rewards/margins": 0.7144767642021179, + "rewards/rejected": -2.284742832183838, + "step": 920 + }, + { + "epoch": 0.24339178225595393, + "grad_norm": 10.933419227600098, + "learning_rate": 4.694147707194659e-06, + "logits/chosen": -1.7351709604263306, + "logits/rejected": -1.6752073764801025, + "logps/chosen": -409.02301025390625, + "logps/rejected": -459.7960510253906, + "loss": 0.5172, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3725465536117554, + "rewards/margins": 0.7059783935546875, + "rewards/rejected": -2.0785250663757324, + "step": 930 + }, + { + "epoch": 0.24600889819419, + "grad_norm": 9.32016372680664, + "learning_rate": 4.683107158658782e-06, + "logits/chosen": -1.6696386337280273, + "logits/rejected": -1.5904427766799927, + "logps/chosen": -443.6543884277344, + "logps/rejected": -504.19293212890625, + "loss": 0.4894, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5346601009368896, + "rewards/margins": 0.8928316235542297, + "rewards/rejected": -2.4274916648864746, + "step": 940 + }, + { + "epoch": 0.24862601413242608, + "grad_norm": 15.52425765991211, + "learning_rate": 4.671884321303407e-06, + "logits/chosen": -1.7015752792358398, + "logits/rejected": -1.627673864364624, + "logps/chosen": -428.1681213378906, + "logps/rejected": -498.12530517578125, + "loss": 0.5229, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.8192806243896484, + "rewards/margins": 0.843016505241394, + "rewards/rejected": -2.662297248840332, + "step": 950 + }, + { + "epoch": 0.2512431300706621, + "grad_norm": 9.358636856079102, + "learning_rate": 4.660480132232224e-06, + "logits/chosen": -1.7376630306243896, + "logits/rejected": -1.6711089611053467, + "logps/chosen": -425.44720458984375, + "logps/rejected": -465.4773864746094, + "loss": 0.5612, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5372945070266724, + "rewards/margins": 0.6410431861877441, + "rewards/rejected": -2.178337812423706, + "step": 960 + }, + { + "epoch": 0.25386024600889817, + "grad_norm": 42.93048095703125, + "learning_rate": 4.6488955436917414e-06, + "logits/chosen": -1.6823867559432983, + "logits/rejected": -1.5265601873397827, + "logps/chosen": -445.4508361816406, + "logps/rejected": -489.4844665527344, + "loss": 0.53, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.655534029006958, + "rewards/margins": 0.9063774347305298, + "rewards/rejected": -2.5619113445281982, + "step": 970 + }, + { + "epoch": 0.2564773619471343, + "grad_norm": 10.477679252624512, + "learning_rate": 4.6371315229917644e-06, + "logits/chosen": -1.5334604978561401, + "logits/rejected": -1.4493004083633423, + "logps/chosen": -509.8636779785156, + "logps/rejected": -587.6615600585938, + "loss": 0.4944, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.265300750732422, + "rewards/margins": 0.9821793437004089, + "rewards/rejected": -3.2474799156188965, + "step": 980 + }, + { + "epoch": 0.2590944778853703, + "grad_norm": 12.81888198852539, + "learning_rate": 4.625189052424638e-06, + "logits/chosen": -1.4550929069519043, + "logits/rejected": -1.353437066078186, + "logps/chosen": -502.6724548339844, + "logps/rejected": -594.8109130859375, + "loss": 0.4652, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6365292072296143, + "rewards/margins": 1.1491750478744507, + "rewards/rejected": -3.7857041358947754, + "step": 990 + }, + { + "epoch": 0.26171159382360637, + "grad_norm": 12.723479270935059, + "learning_rate": 4.613069129183218e-06, + "logits/chosen": -1.5442698001861572, + "logits/rejected": -1.4192157983779907, + "logps/chosen": -524.4293212890625, + "logps/rejected": -580.075439453125, + "loss": 0.5266, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2106876373291016, + "rewards/margins": 0.9188516736030579, + "rewards/rejected": -3.1295390129089355, + "step": 1000 + }, + { + "epoch": 0.26171159382360637, + "eval_logits/chosen": -1.3050363063812256, + "eval_logits/rejected": -1.2043476104736328, + "eval_logps/chosen": -482.4830627441406, + "eval_logps/rejected": -550.0311279296875, + "eval_loss": 0.523389995098114, + "eval_rewards/accuracies": 0.7279999852180481, + "eval_rewards/chosen": -2.1788110733032227, + "eval_rewards/margins": 0.8757960796356201, + "eval_rewards/rejected": -3.0546071529388428, + "eval_runtime": 1595.4607, + "eval_samples_per_second": 1.254, + "eval_steps_per_second": 0.157, + "step": 1000 + }, + { + "epoch": 0.2643287097618425, + "grad_norm": 10.429312705993652, + "learning_rate": 4.600772765277607e-06, + "logits/chosen": -1.3304941654205322, + "logits/rejected": -1.263293743133545, + "logps/chosen": -446.70635986328125, + "logps/rejected": -535.6898193359375, + "loss": 0.4782, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.131920099258423, + "rewards/margins": 0.9416648745536804, + "rewards/rejected": -3.073584794998169, + "step": 1010 + }, + { + "epoch": 0.2669458257000785, + "grad_norm": 16.054344177246094, + "learning_rate": 4.588300987450652e-06, + "logits/chosen": -1.4466055631637573, + "logits/rejected": -1.3539087772369385, + "logps/chosen": -452.9022521972656, + "logps/rejected": -500.742919921875, + "loss": 0.5407, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9927232265472412, + "rewards/margins": 0.8132057189941406, + "rewards/rejected": -2.8059287071228027, + "step": 1020 + }, + { + "epoch": 0.26956294163831457, + "grad_norm": 7.860910415649414, + "learning_rate": 4.5756548370922136e-06, + "logits/chosen": -1.514672875404358, + "logits/rejected": -1.447291374206543, + "logps/chosen": -394.26031494140625, + "logps/rejected": -461.3570251464844, + "loss": 0.5174, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.536415934562683, + "rewards/margins": 0.7608687281608582, + "rewards/rejected": -2.2972846031188965, + "step": 1030 + }, + { + "epoch": 0.2721800575765506, + "grad_norm": 13.006108283996582, + "learning_rate": 4.562835370152206e-06, + "logits/chosen": -1.559757113456726, + "logits/rejected": -1.4329888820648193, + "logps/chosen": -482.7850646972656, + "logps/rejected": -568.3600463867188, + "loss": 0.4777, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8161367177963257, + "rewards/margins": 1.1671006679534912, + "rewards/rejected": -2.9832375049591064, + "step": 1040 + }, + { + "epoch": 0.2747971735147867, + "grad_norm": 10.500707626342773, + "learning_rate": 4.54984365705243e-06, + "logits/chosen": -1.5195045471191406, + "logits/rejected": -1.4307035207748413, + "logps/chosen": -455.4422912597656, + "logps/rejected": -578.11181640625, + "loss": 0.4731, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.8703197240829468, + "rewards/margins": 1.292755126953125, + "rewards/rejected": -3.1630749702453613, + "step": 1050 + }, + { + "epoch": 0.27741428945302277, + "grad_norm": 8.591322898864746, + "learning_rate": 4.536680782597191e-06, + "logits/chosen": -1.5150272846221924, + "logits/rejected": -1.4561691284179688, + "logps/chosen": -402.49053955078125, + "logps/rejected": -479.2098083496094, + "loss": 0.5844, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6649020910263062, + "rewards/margins": 0.8239375352859497, + "rewards/rejected": -2.488839626312256, + "step": 1060 + }, + { + "epoch": 0.2800314053912588, + "grad_norm": 12.471731185913086, + "learning_rate": 4.523347845882718e-06, + "logits/chosen": -1.598962664604187, + "logits/rejected": -1.4552241563796997, + "logps/chosen": -423.9444274902344, + "logps/rejected": -476.2308654785156, + "loss": 0.4524, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3588608503341675, + "rewards/margins": 1.0158392190933228, + "rewards/rejected": -2.3747003078460693, + "step": 1070 + }, + { + "epoch": 0.2826485213294949, + "grad_norm": 8.733776092529297, + "learning_rate": 4.50984596020539e-06, + "logits/chosen": -1.2865254878997803, + "logits/rejected": -1.242490530014038, + "logps/chosen": -467.01776123046875, + "logps/rejected": -511.63507080078125, + "loss": 0.5623, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.8692843914031982, + "rewards/margins": 0.7691918611526489, + "rewards/rejected": -2.6384763717651367, + "step": 1080 + }, + { + "epoch": 0.28526563726773096, + "grad_norm": 9.051119804382324, + "learning_rate": 4.4961762529687745e-06, + "logits/chosen": -1.2762900590896606, + "logits/rejected": -1.1670513153076172, + "logps/chosen": -481.23406982421875, + "logps/rejected": -555.0721435546875, + "loss": 0.5154, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2087130546569824, + "rewards/margins": 0.9038770794868469, + "rewards/rejected": -3.1125903129577637, + "step": 1090 + }, + { + "epoch": 0.287882753205967, + "grad_norm": 7.552842617034912, + "learning_rate": 4.482339865589492e-06, + "logits/chosen": -1.240541934967041, + "logits/rejected": -1.0851424932479858, + "logps/chosen": -478.9158630371094, + "logps/rejected": -492.19097900390625, + "loss": 0.59, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.127556562423706, + "rewards/margins": 0.5717890858650208, + "rewards/rejected": -2.699345827102661, + "step": 1100 + }, + { + "epoch": 0.287882753205967, + "eval_logits/chosen": -1.1099953651428223, + "eval_logits/rejected": -0.9899115562438965, + "eval_logps/chosen": -433.97100830078125, + "eval_logps/rejected": -478.8385314941406, + "eval_loss": 0.5277644991874695, + "eval_rewards/accuracies": 0.7300000190734863, + "eval_rewards/chosen": -1.693690299987793, + "eval_rewards/margins": 0.6489914059638977, + "eval_rewards/rejected": -2.342681646347046, + "eval_runtime": 1596.8217, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 1100 + }, + { + "epoch": 0.2904998691442031, + "grad_norm": 5.962321758270264, + "learning_rate": 4.468337953401909e-06, + "logits/chosen": -1.3310126066207886, + "logits/rejected": -1.2818472385406494, + "logps/chosen": -426.1422424316406, + "logps/rejected": -471.98175048828125, + "loss": 0.572, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5579837560653687, + "rewards/margins": 0.5076408982276917, + "rewards/rejected": -2.065624713897705, + "step": 1110 + }, + { + "epoch": 0.29311698508243916, + "grad_norm": 5.4544477462768555, + "learning_rate": 4.45417168556166e-06, + "logits/chosen": -1.3914196491241455, + "logits/rejected": -1.305474877357483, + "logps/chosen": -344.81463623046875, + "logps/rejected": -411.66412353515625, + "loss": 0.5147, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0368053913116455, + "rewards/margins": 0.6763306856155396, + "rewards/rejected": -1.7131359577178955, + "step": 1120 + }, + { + "epoch": 0.2957341010206752, + "grad_norm": 12.404394149780273, + "learning_rate": 4.439842244948036e-06, + "logits/chosen": -1.3826755285263062, + "logits/rejected": -1.251558780670166, + "logps/chosen": -408.0195007324219, + "logps/rejected": -476.11114501953125, + "loss": 0.5433, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4701926708221436, + "rewards/margins": 0.7369771003723145, + "rewards/rejected": -2.207169771194458, + "step": 1130 + }, + { + "epoch": 0.29835121695891126, + "grad_norm": 28.483901977539062, + "learning_rate": 4.425350828065204e-06, + "logits/chosen": -1.342377781867981, + "logits/rejected": -1.165569543838501, + "logps/chosen": -453.5723571777344, + "logps/rejected": -509.80731201171875, + "loss": 0.4798, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6950212717056274, + "rewards/margins": 1.041156530380249, + "rewards/rejected": -2.736177921295166, + "step": 1140 + }, + { + "epoch": 0.30096833289714736, + "grad_norm": 9.420351028442383, + "learning_rate": 4.410698644942303e-06, + "logits/chosen": -1.3900500535964966, + "logits/rejected": -1.2638782262802124, + "logps/chosen": -445.68646240234375, + "logps/rejected": -537.0733642578125, + "loss": 0.4689, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7342853546142578, + "rewards/margins": 1.1538056135177612, + "rewards/rejected": -2.8880913257598877, + "step": 1150 + }, + { + "epoch": 0.3035854488353834, + "grad_norm": 11.15323543548584, + "learning_rate": 4.395886919032406e-06, + "logits/chosen": -1.3719291687011719, + "logits/rejected": -1.2517584562301636, + "logps/chosen": -444.84375, + "logps/rejected": -515.7125244140625, + "loss": 0.5478, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8111295700073242, + "rewards/margins": 0.9737447500228882, + "rewards/rejected": -2.784874439239502, + "step": 1160 + }, + { + "epoch": 0.30620256477361946, + "grad_norm": 11.627602577209473, + "learning_rate": 4.380916887110366e-06, + "logits/chosen": -1.5677311420440674, + "logits/rejected": -1.4189679622650146, + "logps/chosen": -418.11309814453125, + "logps/rejected": -474.2850646972656, + "loss": 0.5335, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5903016328811646, + "rewards/margins": 0.9656845331192017, + "rewards/rejected": -2.555985927581787, + "step": 1170 + }, + { + "epoch": 0.30881968071185556, + "grad_norm": 10.718832969665527, + "learning_rate": 4.365789799169539e-06, + "logits/chosen": -1.2974934577941895, + "logits/rejected": -1.3446776866912842, + "logps/chosen": -414.12152099609375, + "logps/rejected": -495.92340087890625, + "loss": 0.5394, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6758350133895874, + "rewards/margins": 0.7650316953659058, + "rewards/rejected": -2.440866708755493, + "step": 1180 + }, + { + "epoch": 0.3114367966500916, + "grad_norm": 8.005572319030762, + "learning_rate": 4.350506918317416e-06, + "logits/chosen": -1.4092941284179688, + "logits/rejected": -1.264432668685913, + "logps/chosen": -408.0471496582031, + "logps/rejected": -483.52960205078125, + "loss": 0.5148, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.605602502822876, + "rewards/margins": 0.8261173963546753, + "rewards/rejected": -2.431720018386841, + "step": 1190 + }, + { + "epoch": 0.31405391258832765, + "grad_norm": 7.756717681884766, + "learning_rate": 4.335069520670149e-06, + "logits/chosen": -1.2672417163848877, + "logits/rejected": -1.1932761669158936, + "logps/chosen": -391.0654602050781, + "logps/rejected": -470.7328186035156, + "loss": 0.5724, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6141713857650757, + "rewards/margins": 0.7146965265274048, + "rewards/rejected": -2.3288679122924805, + "step": 1200 + }, + { + "epoch": 0.31405391258832765, + "eval_logits/chosen": -1.2473385334014893, + "eval_logits/rejected": -1.1348552703857422, + "eval_logps/chosen": -420.0863342285156, + "eval_logps/rejected": -485.28948974609375, + "eval_loss": 0.5071337819099426, + "eval_rewards/accuracies": 0.7379999756813049, + "eval_rewards/chosen": -1.5548440217971802, + "eval_rewards/margins": 0.8523474335670471, + "eval_rewards/rejected": -2.407191514968872, + "eval_runtime": 1597.3419, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 1200 + }, + { + "epoch": 0.3166710285265637, + "grad_norm": 5.270259857177734, + "learning_rate": 4.319478895246e-06, + "logits/chosen": -1.3534616231918335, + "logits/rejected": -1.1892000436782837, + "logps/chosen": -396.1217346191406, + "logps/rejected": -454.44378662109375, + "loss": 0.5071, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.490412950515747, + "rewards/margins": 0.8302758932113647, + "rewards/rejected": -2.3206887245178223, + "step": 1210 + }, + { + "epoch": 0.3192881444647998, + "grad_norm": 12.79388427734375, + "learning_rate": 4.303736343857704e-06, + "logits/chosen": -1.3265999555587769, + "logits/rejected": -1.2219207286834717, + "logps/chosen": -440.6163024902344, + "logps/rejected": -568.152099609375, + "loss": 0.4921, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.835211992263794, + "rewards/margins": 1.1568537950515747, + "rewards/rejected": -2.992065906524658, + "step": 1220 + }, + { + "epoch": 0.32190526040303585, + "grad_norm": 13.922798156738281, + "learning_rate": 4.287843181003772e-06, + "logits/chosen": -1.3599532842636108, + "logits/rejected": -1.237168550491333, + "logps/chosen": -486.5572204589844, + "logps/rejected": -522.0225219726562, + "loss": 0.5566, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9279435873031616, + "rewards/margins": 0.8177730441093445, + "rewards/rejected": -2.7457165718078613, + "step": 1230 + }, + { + "epoch": 0.3245223763412719, + "grad_norm": 7.278261184692383, + "learning_rate": 4.27180073375873e-06, + "logits/chosen": -1.3576252460479736, + "logits/rejected": -1.2608816623687744, + "logps/chosen": -445.9983825683594, + "logps/rejected": -495.5048828125, + "loss": 0.4942, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.587226152420044, + "rewards/margins": 0.910873293876648, + "rewards/rejected": -2.4980995655059814, + "step": 1240 + }, + { + "epoch": 0.327139492279508, + "grad_norm": 8.15560531616211, + "learning_rate": 4.255610341662304e-06, + "logits/chosen": -1.3735462427139282, + "logits/rejected": -1.210055947303772, + "logps/chosen": -444.36285400390625, + "logps/rejected": -524.8583374023438, + "loss": 0.5542, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.8914082050323486, + "rewards/margins": 1.0044190883636475, + "rewards/rejected": -2.895827293395996, + "step": 1250 + }, + { + "epoch": 0.32975660821774405, + "grad_norm": 10.175187110900879, + "learning_rate": 4.2392733566075764e-06, + "logits/chosen": -1.4199110269546509, + "logits/rejected": -1.317662000656128, + "logps/chosen": -492.12139892578125, + "logps/rejected": -561.2523803710938, + "loss": 0.5923, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.361861228942871, + "rewards/margins": 0.8468947410583496, + "rewards/rejected": -3.2087559700012207, + "step": 1260 + }, + { + "epoch": 0.3323737241559801, + "grad_norm": 8.98975944519043, + "learning_rate": 4.2227911427280975e-06, + "logits/chosen": -1.5209752321243286, + "logits/rejected": -1.3899322748184204, + "logps/chosen": -398.7652282714844, + "logps/rejected": -442.14892578125, + "loss": 0.5594, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4862077236175537, + "rewards/margins": 0.761127769947052, + "rewards/rejected": -2.247335195541382, + "step": 1270 + }, + { + "epoch": 0.33499084009421615, + "grad_norm": 9.442636489868164, + "learning_rate": 4.206165076283983e-06, + "logits/chosen": -1.5363355875015259, + "logits/rejected": -1.428397536277771, + "logps/chosen": -410.4297790527344, + "logps/rejected": -474.6888122558594, + "loss": 0.5101, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6462541818618774, + "rewards/margins": 0.8527911305427551, + "rewards/rejected": -2.4990451335906982, + "step": 1280 + }, + { + "epoch": 0.33760795603245225, + "grad_norm": 9.517970085144043, + "learning_rate": 4.189396545546995e-06, + "logits/chosen": -1.4206923246383667, + "logits/rejected": -1.3242676258087158, + "logps/chosen": -455.42474365234375, + "logps/rejected": -532.8533325195312, + "loss": 0.5055, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.0629706382751465, + "rewards/margins": 0.9739512205123901, + "rewards/rejected": -3.036921977996826, + "step": 1290 + }, + { + "epoch": 0.3402250719706883, + "grad_norm": 11.92896842956543, + "learning_rate": 4.172486950684627e-06, + "logits/chosen": -1.458961844444275, + "logits/rejected": -1.4081146717071533, + "logps/chosen": -429.62548828125, + "logps/rejected": -503.53582763671875, + "loss": 0.5457, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7603120803833008, + "rewards/margins": 0.771033525466919, + "rewards/rejected": -2.5313456058502197, + "step": 1300 + }, + { + "epoch": 0.3402250719706883, + "eval_logits/chosen": -1.3403185606002808, + "eval_logits/rejected": -1.242436408996582, + "eval_logps/chosen": -440.03851318359375, + "eval_logps/rejected": -507.2138366699219, + "eval_loss": 0.5013459920883179, + "eval_rewards/accuracies": 0.7434999942779541, + "eval_rewards/chosen": -1.7543656826019287, + "eval_rewards/margins": 0.8720693588256836, + "eval_rewards/rejected": -2.6264350414276123, + "eval_runtime": 1597.2746, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 1300 + }, + { + "epoch": 0.34284218790892435, + "grad_norm": 7.771608352661133, + "learning_rate": 4.155437703643182e-06, + "logits/chosen": -1.5240622758865356, + "logits/rejected": -1.3838030099868774, + "logps/chosen": -408.34857177734375, + "logps/rejected": -462.7867736816406, + "loss": 0.5133, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6171270608901978, + "rewards/margins": 0.8352136611938477, + "rewards/rejected": -2.452340602874756, + "step": 1310 + }, + { + "epoch": 0.34545930384716045, + "grad_norm": 13.282800674438477, + "learning_rate": 4.138250228029882e-06, + "logits/chosen": -1.4314453601837158, + "logits/rejected": -1.3674726486206055, + "logps/chosen": -449.40045166015625, + "logps/rejected": -546.0065307617188, + "loss": 0.5029, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9253730773925781, + "rewards/margins": 0.9314772486686707, + "rewards/rejected": -2.8568501472473145, + "step": 1320 + }, + { + "epoch": 0.3480764197853965, + "grad_norm": 5.933595180511475, + "learning_rate": 4.120925958993994e-06, + "logits/chosen": -1.3017045259475708, + "logits/rejected": -1.251571536064148, + "logps/chosen": -431.25537109375, + "logps/rejected": -517.0537109375, + "loss": 0.557, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.9745893478393555, + "rewards/margins": 0.9014847874641418, + "rewards/rejected": -2.8760738372802734, + "step": 1330 + }, + { + "epoch": 0.35069353572363254, + "grad_norm": 11.526862144470215, + "learning_rate": 4.103466343106999e-06, + "logits/chosen": -1.3678683042526245, + "logits/rejected": -1.2736116647720337, + "logps/chosen": -471.77728271484375, + "logps/rejected": -537.7550048828125, + "loss": 0.5154, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0238394737243652, + "rewards/margins": 0.9198586344718933, + "rewards/rejected": -2.9436984062194824, + "step": 1340 + }, + { + "epoch": 0.35331065166186865, + "grad_norm": 10.23589038848877, + "learning_rate": 4.085872838241797e-06, + "logits/chosen": -1.3284608125686646, + "logits/rejected": -1.217164397239685, + "logps/chosen": -450.1647033691406, + "logps/rejected": -507.2977600097656, + "loss": 0.569, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.8795936107635498, + "rewards/margins": 0.7713474631309509, + "rewards/rejected": -2.6509411334991455, + "step": 1350 + }, + { + "epoch": 0.3559277676001047, + "grad_norm": 9.470162391662598, + "learning_rate": 4.06814691345098e-06, + "logits/chosen": -1.387274980545044, + "logits/rejected": -1.2676749229431152, + "logps/chosen": -405.6870422363281, + "logps/rejected": -467.76708984375, + "loss": 0.4982, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5023095607757568, + "rewards/margins": 0.8411477208137512, + "rewards/rejected": -2.3434574604034424, + "step": 1360 + }, + { + "epoch": 0.35854488353834074, + "grad_norm": 11.479103088378906, + "learning_rate": 4.050290048844171e-06, + "logits/chosen": -1.4606144428253174, + "logits/rejected": -1.400508165359497, + "logps/chosen": -422.5201721191406, + "logps/rejected": -496.3780212402344, + "loss": 0.5403, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5399492979049683, + "rewards/margins": 0.7900384068489075, + "rewards/rejected": -2.3299877643585205, + "step": 1370 + }, + { + "epoch": 0.3611619994765768, + "grad_norm": 9.197821617126465, + "learning_rate": 4.032303735464422e-06, + "logits/chosen": -1.6123807430267334, + "logits/rejected": -1.459542989730835, + "logps/chosen": -428.04095458984375, + "logps/rejected": -507.18243408203125, + "loss": 0.4608, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5614588260650635, + "rewards/margins": 1.0217015743255615, + "rewards/rejected": -2.583160161972046, + "step": 1380 + }, + { + "epoch": 0.3637791154148129, + "grad_norm": 13.988523483276367, + "learning_rate": 4.014189475163727e-06, + "logits/chosen": -1.4627307653427124, + "logits/rejected": -1.3992432355880737, + "logps/chosen": -429.89642333984375, + "logps/rejected": -529.4677124023438, + "loss": 0.4813, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.753618836402893, + "rewards/margins": 1.0768356323242188, + "rewards/rejected": -2.8304543495178223, + "step": 1390 + }, + { + "epoch": 0.36639623135304894, + "grad_norm": 14.094236373901367, + "learning_rate": 3.995948780477605e-06, + "logits/chosen": -1.6445674896240234, + "logits/rejected": -1.5534647703170776, + "logps/chosen": -442.5335388183594, + "logps/rejected": -506.78253173828125, + "loss": 0.5423, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7385400533676147, + "rewards/margins": 0.8896828889846802, + "rewards/rejected": -2.628222942352295, + "step": 1400 + }, + { + "epoch": 0.36639623135304894, + "eval_logits/chosen": -1.586852788925171, + "eval_logits/rejected": -1.5062702894210815, + "eval_logps/chosen": -428.40972900390625, + "eval_logps/rejected": -505.70770263671875, + "eval_loss": 0.5131703019142151, + "eval_rewards/accuracies": 0.7210000157356262, + "eval_rewards/chosen": -1.6380778551101685, + "eval_rewards/margins": 0.9732955098152161, + "eval_rewards/rejected": -2.6113734245300293, + "eval_runtime": 1597.5064, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.156, + "step": 1400 + }, + { + "epoch": 0.369013347291285, + "grad_norm": 13.169591903686523, + "learning_rate": 3.977583174498816e-06, + "logits/chosen": -1.6033601760864258, + "logits/rejected": -1.529076099395752, + "logps/chosen": -446.4644470214844, + "logps/rejected": -562.0938720703125, + "loss": 0.4006, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8283302783966064, + "rewards/margins": 1.3763213157653809, + "rewards/rejected": -3.2046515941619873, + "step": 1410 + }, + { + "epoch": 0.3716304632295211, + "grad_norm": 12.735097885131836, + "learning_rate": 3.959094190750172e-06, + "logits/chosen": -1.6629726886749268, + "logits/rejected": -1.5727919340133667, + "logps/chosen": -460.08013916015625, + "logps/rejected": -542.1484375, + "loss": 0.5034, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7491388320922852, + "rewards/margins": 1.091073989868164, + "rewards/rejected": -2.8402130603790283, + "step": 1420 + }, + { + "epoch": 0.37424757916775714, + "grad_norm": 9.543456077575684, + "learning_rate": 3.9404833730564975e-06, + "logits/chosen": -1.5923559665679932, + "logits/rejected": -1.5246838331222534, + "logps/chosen": -397.0897216796875, + "logps/rejected": -477.6280212402344, + "loss": 0.5221, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4408881664276123, + "rewards/margins": 0.9207174181938171, + "rewards/rejected": -2.361605405807495, + "step": 1430 + }, + { + "epoch": 0.3768646951059932, + "grad_norm": 15.872808456420898, + "learning_rate": 3.921752275415712e-06, + "logits/chosen": -1.6502765417099, + "logits/rejected": -1.5927629470825195, + "logps/chosen": -403.17657470703125, + "logps/rejected": -513.6209106445312, + "loss": 0.4272, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.527881145477295, + "rewards/margins": 1.3093104362487793, + "rewards/rejected": -2.837191581726074, + "step": 1440 + }, + { + "epoch": 0.37948181104422923, + "grad_norm": 13.932242393493652, + "learning_rate": 3.902902461869079e-06, + "logits/chosen": -1.6036418676376343, + "logits/rejected": -1.5119550228118896, + "logps/chosen": -423.63275146484375, + "logps/rejected": -522.7612915039062, + "loss": 0.5484, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8413972854614258, + "rewards/margins": 1.1561634540557861, + "rewards/rejected": -2.997560501098633, + "step": 1450 + }, + { + "epoch": 0.38209892698246534, + "grad_norm": 17.43979835510254, + "learning_rate": 3.883935506370605e-06, + "logits/chosen": -1.6155637502670288, + "logits/rejected": -1.546623706817627, + "logps/chosen": -433.8269958496094, + "logps/rejected": -493.6173400878906, + "loss": 0.5628, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8069992065429688, + "rewards/margins": 0.8752773404121399, + "rewards/rejected": -2.682276487350464, + "step": 1460 + }, + { + "epoch": 0.3847160429207014, + "grad_norm": 6.709349155426025, + "learning_rate": 3.864852992655617e-06, + "logits/chosen": -1.6773513555526733, + "logits/rejected": -1.6156046390533447, + "logps/chosen": -408.98004150390625, + "logps/rejected": -496.5398864746094, + "loss": 0.4636, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6042149066925049, + "rewards/margins": 0.9984095692634583, + "rewards/rejected": -2.6026244163513184, + "step": 1470 + }, + { + "epoch": 0.38733315885893743, + "grad_norm": 5.378371715545654, + "learning_rate": 3.845656514108516e-06, + "logits/chosen": -1.6842008829116821, + "logits/rejected": -1.5901457071304321, + "logps/chosen": -467.2499084472656, + "logps/rejected": -508.35491943359375, + "loss": 0.5207, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0820114612579346, + "rewards/margins": 0.9600709080696106, + "rewards/rejected": -3.0420823097229004, + "step": 1480 + }, + { + "epoch": 0.38995027479717354, + "grad_norm": 14.073667526245117, + "learning_rate": 3.826347673629738e-06, + "logits/chosen": -1.6913728713989258, + "logits/rejected": -1.5630801916122437, + "logps/chosen": -425.8887634277344, + "logps/rejected": -511.43194580078125, + "loss": 0.4841, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7265421152114868, + "rewards/margins": 1.118648648262024, + "rewards/rejected": -2.84519100189209, + "step": 1490 + }, + { + "epoch": 0.3925673907354096, + "grad_norm": 15.713250160217285, + "learning_rate": 3.8069280835019062e-06, + "logits/chosen": -1.631744384765625, + "logits/rejected": -1.5060975551605225, + "logps/chosen": -430.0010681152344, + "logps/rejected": -537.3772583007812, + "loss": 0.4492, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.657666563987732, + "rewards/margins": 1.2653546333312988, + "rewards/rejected": -2.9230213165283203, + "step": 1500 + }, + { + "epoch": 0.3925673907354096, + "eval_logits/chosen": -1.5949609279632568, + "eval_logits/rejected": -1.4971818923950195, + "eval_logps/chosen": -423.41754150390625, + "eval_logps/rejected": -503.4827575683594, + "eval_loss": 0.5122300386428833, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": -1.5881556272506714, + "eval_rewards/margins": 1.0009682178497314, + "eval_rewards/rejected": -2.5891237258911133, + "eval_runtime": 1597.6931, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.156, + "step": 1500 + }, + { + "epoch": 0.39518450667364563, + "grad_norm": 14.919211387634277, + "learning_rate": 3.7873993652552077e-06, + "logits/chosen": -1.6476099491119385, + "logits/rejected": -1.5861533880233765, + "logps/chosen": -370.7307434082031, + "logps/rejected": -437.010986328125, + "loss": 0.5933, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.370052695274353, + "rewards/margins": 0.7218869924545288, + "rewards/rejected": -2.091939926147461, + "step": 1510 + }, + { + "epoch": 0.39780162261188173, + "grad_norm": 6.893444061279297, + "learning_rate": 3.7677631495319953e-06, + "logits/chosen": -1.7386844158172607, + "logits/rejected": -1.6610181331634521, + "logps/chosen": -358.55792236328125, + "logps/rejected": -410.8779296875, + "loss": 0.5166, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9495410919189453, + "rewards/margins": 0.7035711407661438, + "rewards/rejected": -1.6531124114990234, + "step": 1520 + }, + { + "epoch": 0.4004187385501178, + "grad_norm": 10.669229507446289, + "learning_rate": 3.748021075950633e-06, + "logits/chosen": -1.6956592798233032, + "logits/rejected": -1.634338617324829, + "logps/chosen": -401.1066589355469, + "logps/rejected": -450.43701171875, + "loss": 0.5677, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2650973796844482, + "rewards/margins": 0.5999857187271118, + "rewards/rejected": -1.8650833368301392, + "step": 1530 + }, + { + "epoch": 0.40303585448835383, + "grad_norm": 13.775323867797852, + "learning_rate": 3.7281747929685824e-06, + "logits/chosen": -1.4220095872879028, + "logits/rejected": -1.3044965267181396, + "logps/chosen": -428.1224060058594, + "logps/rejected": -506.9508361816406, + "loss": 0.5483, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9053666591644287, + "rewards/margins": 0.9458476305007935, + "rewards/rejected": -2.8512144088745117, + "step": 1540 + }, + { + "epoch": 0.4056529704265899, + "grad_norm": 14.96325397491455, + "learning_rate": 3.7082259577447604e-06, + "logits/chosen": -1.4775934219360352, + "logits/rejected": -1.3828151226043701, + "logps/chosen": -484.23712158203125, + "logps/rejected": -562.1600341796875, + "loss": 0.4745, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.1311516761779785, + "rewards/margins": 0.9796358942985535, + "rewards/rejected": -3.1107876300811768, + "step": 1550 + }, + { + "epoch": 0.408270086364826, + "grad_norm": 12.222273826599121, + "learning_rate": 3.6881762360011688e-06, + "logits/chosen": -1.5065914392471313, + "logits/rejected": -1.3439867496490479, + "logps/chosen": -507.56646728515625, + "logps/rejected": -579.8803100585938, + "loss": 0.5187, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.261667013168335, + "rewards/margins": 1.0928138494491577, + "rewards/rejected": -3.3544812202453613, + "step": 1560 + }, + { + "epoch": 0.410887202303062, + "grad_norm": 9.984075546264648, + "learning_rate": 3.668027301883802e-06, + "logits/chosen": -1.5400466918945312, + "logits/rejected": -1.42914617061615, + "logps/chosen": -428.8944396972656, + "logps/rejected": -516.1829833984375, + "loss": 0.4994, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7761850357055664, + "rewards/margins": 1.021864891052246, + "rewards/rejected": -2.7980499267578125, + "step": 1570 + }, + { + "epoch": 0.4135043182412981, + "grad_norm": 8.096105575561523, + "learning_rate": 3.64778083782286e-06, + "logits/chosen": -1.4683252573013306, + "logits/rejected": -1.4568402767181396, + "logps/chosen": -420.28662109375, + "logps/rejected": -531.93310546875, + "loss": 0.5363, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.67800772190094, + "rewards/margins": 0.8264617919921875, + "rewards/rejected": -2.504469394683838, + "step": 1580 + }, + { + "epoch": 0.4161214341795342, + "grad_norm": 8.454867362976074, + "learning_rate": 3.627438534392268e-06, + "logits/chosen": -1.563232421875, + "logits/rejected": -1.5413362979888916, + "logps/chosen": -382.3138122558594, + "logps/rejected": -473.07427978515625, + "loss": 0.4969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4439641237258911, + "rewards/margins": 0.8402025103569031, + "rewards/rejected": -2.2841668128967285, + "step": 1590 + }, + { + "epoch": 0.4187385501177702, + "grad_norm": 8.18990421295166, + "learning_rate": 3.607002090168506e-06, + "logits/chosen": -1.374459981918335, + "logits/rejected": -1.3057047128677368, + "logps/chosen": -437.67437744140625, + "logps/rejected": -498.42022705078125, + "loss": 0.5491, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7346376180648804, + "rewards/margins": 0.8208419680595398, + "rewards/rejected": -2.5554797649383545, + "step": 1600 + }, + { + "epoch": 0.4187385501177702, + "eval_logits/chosen": -1.2524830102920532, + "eval_logits/rejected": -1.129266381263733, + "eval_logps/chosen": -434.1912536621094, + "eval_logps/rejected": -515.1350708007812, + "eval_loss": 0.49564477801322937, + "eval_rewards/accuracies": 0.7394999861717224, + "eval_rewards/chosen": -1.6958929300308228, + "eval_rewards/margins": 1.0097541809082031, + "eval_rewards/rejected": -2.7056469917297363, + "eval_runtime": 1597.9006, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.156, + "step": 1600 + }, + { + "epoch": 0.4213556660560063, + "grad_norm": 5.6236395835876465, + "learning_rate": 3.586473211588787e-06, + "logits/chosen": -1.3308565616607666, + "logits/rejected": -1.2487056255340576, + "logps/chosen": -416.17218017578125, + "logps/rejected": -541.766357421875, + "loss": 0.4321, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7147547006607056, + "rewards/margins": 1.190341591835022, + "rewards/rejected": -2.9050965309143066, + "step": 1610 + }, + { + "epoch": 0.4239727819942423, + "grad_norm": 17.16832160949707, + "learning_rate": 3.5658536128085623e-06, + "logits/chosen": -1.3294860124588013, + "logits/rejected": -1.1633020639419556, + "logps/chosen": -514.1262817382812, + "logps/rejected": -586.5099487304688, + "loss": 0.5903, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5135364532470703, + "rewards/margins": 1.0195398330688477, + "rewards/rejected": -3.533076047897339, + "step": 1620 + }, + { + "epoch": 0.4265898979324784, + "grad_norm": 9.197494506835938, + "learning_rate": 3.545145015558399e-06, + "logits/chosen": -1.1497808694839478, + "logits/rejected": -1.1394188404083252, + "logps/chosen": -455.3041076660156, + "logps/rejected": -552.9832763671875, + "loss": 0.5097, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.2378275394439697, + "rewards/margins": 1.1191097497940063, + "rewards/rejected": -3.3569374084472656, + "step": 1630 + }, + { + "epoch": 0.42920701387071447, + "grad_norm": 5.227996826171875, + "learning_rate": 3.5243491490002056e-06, + "logits/chosen": -1.344987154006958, + "logits/rejected": -1.2656126022338867, + "logps/chosen": -442.0404357910156, + "logps/rejected": -519.5951538085938, + "loss": 0.5867, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8838779926300049, + "rewards/margins": 0.8285346031188965, + "rewards/rejected": -2.7124123573303223, + "step": 1640 + }, + { + "epoch": 0.4318241298089505, + "grad_norm": 6.703832149505615, + "learning_rate": 3.503467749582857e-06, + "logits/chosen": -1.4452259540557861, + "logits/rejected": -1.273822546005249, + "logps/chosen": -400.95703125, + "logps/rejected": -432.19775390625, + "loss": 0.5857, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4979441165924072, + "rewards/margins": 0.6527599096298218, + "rewards/rejected": -2.1507039070129395, + "step": 1650 + }, + { + "epoch": 0.4344412457471866, + "grad_norm": 11.32523250579834, + "learning_rate": 3.4825025608971947e-06, + "logits/chosen": -1.386530876159668, + "logits/rejected": -1.324684500694275, + "logps/chosen": -355.1617736816406, + "logps/rejected": -427.02001953125, + "loss": 0.5318, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3329139947891235, + "rewards/margins": 0.6805238723754883, + "rewards/rejected": -2.0134379863739014, + "step": 1660 + }, + { + "epoch": 0.43705836168542267, + "grad_norm": 7.073009490966797, + "learning_rate": 3.4614553335304407e-06, + "logits/chosen": -1.3409563302993774, + "logits/rejected": -1.1487702131271362, + "logps/chosen": -427.2156677246094, + "logps/rejected": -493.6348571777344, + "loss": 0.4728, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.584120273590088, + "rewards/margins": 0.9814236760139465, + "rewards/rejected": -2.5655438899993896, + "step": 1670 + }, + { + "epoch": 0.4396754776236587, + "grad_norm": 11.820454597473145, + "learning_rate": 3.4403278249200222e-06, + "logits/chosen": -1.262634038925171, + "logits/rejected": -1.0713304281234741, + "logps/chosen": -481.8815002441406, + "logps/rejected": -564.7189331054688, + "loss": 0.4399, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9938242435455322, + "rewards/margins": 1.2474219799041748, + "rewards/rejected": -3.241246461868286, + "step": 1680 + }, + { + "epoch": 0.44229259356189476, + "grad_norm": 19.43402671813965, + "learning_rate": 3.4191217992068293e-06, + "logits/chosen": -1.2867403030395508, + "logits/rejected": -1.1040995121002197, + "logps/chosen": -548.404296875, + "logps/rejected": -614.572998046875, + "loss": 0.5467, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.703883647918701, + "rewards/margins": 1.1177372932434082, + "rewards/rejected": -3.821620464324951, + "step": 1690 + }, + { + "epoch": 0.44490970950013087, + "grad_norm": 18.481212615966797, + "learning_rate": 3.3978390270879056e-06, + "logits/chosen": -1.169626235961914, + "logits/rejected": -1.0800492763519287, + "logps/chosen": -525.806884765625, + "logps/rejected": -649.9021606445312, + "loss": 0.5408, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1594154834747314, + "rewards/margins": 1.1724364757537842, + "rewards/rejected": -4.331852436065674, + "step": 1700 + }, + { + "epoch": 0.44490970950013087, + "eval_logits/chosen": -1.1773556470870972, + "eval_logits/rejected": -1.0519527196884155, + "eval_logps/chosen": -568.2142333984375, + "eval_logps/rejected": -668.4868774414062, + "eval_loss": 0.5111355781555176, + "eval_rewards/accuracies": 0.7304999828338623, + "eval_rewards/chosen": -3.036123275756836, + "eval_rewards/margins": 1.203041672706604, + "eval_rewards/rejected": -4.23916482925415, + "eval_runtime": 1598.229, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.156, + "step": 1700 + }, + { + "epoch": 0.4475268254383669, + "grad_norm": 17.149526596069336, + "learning_rate": 3.3764812856685995e-06, + "logits/chosen": -1.3090331554412842, + "logits/rejected": -1.292729139328003, + "logps/chosen": -502.16497802734375, + "logps/rejected": -622.8278198242188, + "loss": 0.5266, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7771124839782715, + "rewards/margins": 1.0713131427764893, + "rewards/rejected": -3.8484256267547607, + "step": 1710 + }, + { + "epoch": 0.45014394137660296, + "grad_norm": 12.321800231933594, + "learning_rate": 3.3550503583141726e-06, + "logits/chosen": -1.5342642068862915, + "logits/rejected": -1.4036284685134888, + "logps/chosen": -507.0011291503906, + "logps/rejected": -602.5591430664062, + "loss": 0.497, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3824760913848877, + "rewards/margins": 1.1078113317489624, + "rewards/rejected": -3.4902870655059814, + "step": 1720 + }, + { + "epoch": 0.45276105731483907, + "grad_norm": 7.4277191162109375, + "learning_rate": 3.3335480345008907e-06, + "logits/chosen": -1.5070991516113281, + "logits/rejected": -1.4292500019073486, + "logps/chosen": -427.82818603515625, + "logps/rejected": -509.09552001953125, + "loss": 0.4635, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6776756048202515, + "rewards/margins": 1.0597718954086304, + "rewards/rejected": -2.737447500228882, + "step": 1730 + }, + { + "epoch": 0.4553781732530751, + "grad_norm": 9.2946195602417, + "learning_rate": 3.3119761096666055e-06, + "logits/chosen": -1.5444905757904053, + "logits/rejected": -1.4140485525131226, + "logps/chosen": -468.56182861328125, + "logps/rejected": -515.2850341796875, + "loss": 0.5735, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.9366995096206665, + "rewards/margins": 0.7699880599975586, + "rewards/rejected": -2.7066876888275146, + "step": 1740 + }, + { + "epoch": 0.45799528919131116, + "grad_norm": 7.504373073577881, + "learning_rate": 3.290336385060832e-06, + "logits/chosen": -1.724323034286499, + "logits/rejected": -1.5582685470581055, + "logps/chosen": -436.670166015625, + "logps/rejected": -503.16705322265625, + "loss": 0.5256, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8786998987197876, + "rewards/margins": 0.8662854433059692, + "rewards/rejected": -2.7449851036071777, + "step": 1750 + }, + { + "epoch": 0.46061240512954726, + "grad_norm": 11.6707181930542, + "learning_rate": 3.268630667594348e-06, + "logits/chosen": -1.552310824394226, + "logits/rejected": -1.5216782093048096, + "logps/chosen": -447.37835693359375, + "logps/rejected": -520.7014770507812, + "loss": 0.507, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9179880619049072, + "rewards/margins": 0.9757116436958313, + "rewards/rejected": -2.8936996459960938, + "step": 1760 + }, + { + "epoch": 0.4632295210677833, + "grad_norm": 10.357820510864258, + "learning_rate": 3.2468607696883147e-06, + "logits/chosen": -1.6096198558807373, + "logits/rejected": -1.5589017868041992, + "logps/chosen": -452.2684631347656, + "logps/rejected": -553.5985717773438, + "loss": 0.4902, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9917962551116943, + "rewards/margins": 0.9700411558151245, + "rewards/rejected": -2.9618372917175293, + "step": 1770 + }, + { + "epoch": 0.46584663700601936, + "grad_norm": 8.833939552307129, + "learning_rate": 3.225028509122944e-06, + "logits/chosen": -1.7435325384140015, + "logits/rejected": -1.6519057750701904, + "logps/chosen": -414.03179931640625, + "logps/rejected": -483.228515625, + "loss": 0.5437, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7762641906738281, + "rewards/margins": 0.8044807314872742, + "rewards/rejected": -2.580744504928589, + "step": 1780 + }, + { + "epoch": 0.4684637529442554, + "grad_norm": 9.815378189086914, + "learning_rate": 3.2031357088857083e-06, + "logits/chosen": -1.6989177465438843, + "logits/rejected": -1.6307258605957031, + "logps/chosen": -467.5003967285156, + "logps/rejected": -558.3304443359375, + "loss": 0.5356, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.923940896987915, + "rewards/margins": 0.9426455497741699, + "rewards/rejected": -2.866586446762085, + "step": 1790 + }, + { + "epoch": 0.4710808688824915, + "grad_norm": 10.215496063232422, + "learning_rate": 3.181184197019127e-06, + "logits/chosen": -1.4613163471221924, + "logits/rejected": -1.3699003458023071, + "logps/chosen": -425.10430908203125, + "logps/rejected": -576.7496948242188, + "loss": 0.4705, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9493353366851807, + "rewards/margins": 1.254817247390747, + "rewards/rejected": -3.2041525840759277, + "step": 1800 + }, + { + "epoch": 0.4710808688824915, + "eval_logits/chosen": -1.4508103132247925, + "eval_logits/rejected": -1.3478518724441528, + "eval_logps/chosen": -476.96630859375, + "eval_logps/rejected": -563.5121459960938, + "eval_loss": 0.49488988518714905, + "eval_rewards/accuracies": 0.7434999942779541, + "eval_rewards/chosen": -2.123643636703491, + "eval_rewards/margins": 1.0657742023468018, + "eval_rewards/rejected": -3.189417839050293, + "eval_runtime": 1598.3976, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.156, + "step": 1800 + }, + { + "epoch": 0.47369798482072756, + "grad_norm": 19.882471084594727, + "learning_rate": 3.159175806468126e-06, + "logits/chosen": -1.4092731475830078, + "logits/rejected": -1.2594877481460571, + "logps/chosen": -483.1397399902344, + "logps/rejected": -564.5753173828125, + "loss": 0.4771, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.345776081085205, + "rewards/margins": 1.1159356832504272, + "rewards/rejected": -3.4617114067077637, + "step": 1810 + }, + { + "epoch": 0.4763151007589636, + "grad_norm": 13.595392227172852, + "learning_rate": 3.1371123749269804e-06, + "logits/chosen": -1.4161027669906616, + "logits/rejected": -1.3486690521240234, + "logps/chosen": -579.3997192382812, + "logps/rejected": -658.3812255859375, + "loss": 0.588, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.961474895477295, + "rewards/margins": 0.9501525163650513, + "rewards/rejected": -3.9116275310516357, + "step": 1820 + }, + { + "epoch": 0.4789322166971997, + "grad_norm": 16.07357406616211, + "learning_rate": 3.114995744685877e-06, + "logits/chosen": -1.3573070764541626, + "logits/rejected": -1.3251278400421143, + "logps/chosen": -537.996826171875, + "logps/rejected": -629.4193115234375, + "loss": 0.5297, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.90761399269104, + "rewards/margins": 1.0439841747283936, + "rewards/rejected": -3.9515984058380127, + "step": 1830 + }, + { + "epoch": 0.48154933263543576, + "grad_norm": 6.550096035003662, + "learning_rate": 3.0928277624770743e-06, + "logits/chosen": -1.5519769191741943, + "logits/rejected": -1.4119082689285278, + "logps/chosen": -502.74102783203125, + "logps/rejected": -586.8773193359375, + "loss": 0.5118, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.1812124252319336, + "rewards/margins": 1.1252543926239014, + "rewards/rejected": -3.306466579437256, + "step": 1840 + }, + { + "epoch": 0.4841664485736718, + "grad_norm": 7.670591354370117, + "learning_rate": 3.070610279320708e-06, + "logits/chosen": -1.464906930923462, + "logits/rejected": -1.32064950466156, + "logps/chosen": -504.308349609375, + "logps/rejected": -599.1265869140625, + "loss": 0.4443, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.1736900806427, + "rewards/margins": 1.1892703771591187, + "rewards/rejected": -3.3629603385925293, + "step": 1850 + }, + { + "epoch": 0.48678356451190785, + "grad_norm": 9.109015464782715, + "learning_rate": 3.0483451503702264e-06, + "logits/chosen": -1.249629020690918, + "logits/rejected": -1.168210744857788, + "logps/chosen": -578.9190063476562, + "logps/rejected": -669.9890747070312, + "loss": 0.5669, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.928832530975342, + "rewards/margins": 1.1118719577789307, + "rewards/rejected": -4.040704250335693, + "step": 1860 + }, + { + "epoch": 0.48940068045014395, + "grad_norm": 10.058290481567383, + "learning_rate": 3.0260342347574916e-06, + "logits/chosen": -1.2091599702835083, + "logits/rejected": -1.0555975437164307, + "logps/chosen": -534.0882568359375, + "logps/rejected": -660.2562255859375, + "loss": 0.4437, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.618621826171875, + "rewards/margins": 1.4526309967041016, + "rewards/rejected": -4.071252822875977, + "step": 1870 + }, + { + "epoch": 0.49201779638838, + "grad_norm": 10.198968887329102, + "learning_rate": 3.0036793954375358e-06, + "logits/chosen": -1.303740382194519, + "logits/rejected": -1.155242681503296, + "logps/chosen": -507.42529296875, + "logps/rejected": -583.2550048828125, + "loss": 0.4895, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.374905824661255, + "rewards/margins": 1.1803052425384521, + "rewards/rejected": -3.555210828781128, + "step": 1880 + }, + { + "epoch": 0.49463491232661605, + "grad_norm": 10.858015060424805, + "learning_rate": 2.981282499033009e-06, + "logits/chosen": -1.3813010454177856, + "logits/rejected": -1.2690826654434204, + "logps/chosen": -486.0970153808594, + "logps/rejected": -569.85888671875, + "loss": 0.5031, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0988192558288574, + "rewards/margins": 1.0918813943862915, + "rewards/rejected": -3.1907010078430176, + "step": 1890 + }, + { + "epoch": 0.49725202826485215, + "grad_norm": 10.221612930297852, + "learning_rate": 2.9588454156783163e-06, + "logits/chosen": -1.4239078760147095, + "logits/rejected": -1.2654526233673096, + "logps/chosen": -474.64825439453125, + "logps/rejected": -587.3582763671875, + "loss": 0.4447, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9338430166244507, + "rewards/margins": 1.3446595668792725, + "rewards/rejected": -3.2785022258758545, + "step": 1900 + }, + { + "epoch": 0.49725202826485215, + "eval_logits/chosen": -1.2950754165649414, + "eval_logits/rejected": -1.1710810661315918, + "eval_logps/chosen": -468.10113525390625, + "eval_logps/rejected": -559.6229248046875, + "eval_loss": 0.49835336208343506, + "eval_rewards/accuracies": 0.7419999837875366, + "eval_rewards/chosen": -2.0349912643432617, + "eval_rewards/margins": 1.1155344247817993, + "eval_rewards/rejected": -3.1505255699157715, + "eval_runtime": 1598.3655, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.156, + "step": 1900 + }, + { + "epoch": 0.4998691442030882, + "grad_norm": 12.752076148986816, + "learning_rate": 2.9363700188634597e-06, + "logits/chosen": -1.3872668743133545, + "logits/rejected": -1.265047311782837, + "logps/chosen": -489.17291259765625, + "logps/rejected": -553.0140991210938, + "loss": 0.5231, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.242345094680786, + "rewards/margins": 1.008666753768921, + "rewards/rejected": -3.251011610031128, + "step": 1910 + }, + { + "epoch": 0.5024862601413242, + "grad_norm": 10.04758071899414, + "learning_rate": 2.9138581852776053e-06, + "logits/chosen": -1.368890404701233, + "logits/rejected": -1.2567319869995117, + "logps/chosen": -493.54638671875, + "logps/rejected": -586.2965087890625, + "loss": 0.5251, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.339406728744507, + "rewards/margins": 1.0563156604766846, + "rewards/rejected": -3.3957226276397705, + "step": 1920 + }, + { + "epoch": 0.5051033760795604, + "grad_norm": 9.104912757873535, + "learning_rate": 2.8913117946523805e-06, + "logits/chosen": -1.4171912670135498, + "logits/rejected": -1.2455161809921265, + "logps/chosen": -490.3606872558594, + "logps/rejected": -558.6239013671875, + "loss": 0.4792, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2399258613586426, + "rewards/margins": 1.0637315511703491, + "rewards/rejected": -3.3036580085754395, + "step": 1930 + }, + { + "epoch": 0.5077204920177963, + "grad_norm": 11.299101829528809, + "learning_rate": 2.8687327296049126e-06, + "logits/chosen": -1.4033154249191284, + "logits/rejected": -1.307733178138733, + "logps/chosen": -491.1643981933594, + "logps/rejected": -589.1052856445312, + "loss": 0.5129, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.32118558883667, + "rewards/margins": 1.0772285461425781, + "rewards/rejected": -3.398413896560669, + "step": 1940 + }, + { + "epoch": 0.5103376079560324, + "grad_norm": 18.843233108520508, + "learning_rate": 2.8461228754806376e-06, + "logits/chosen": -1.4243779182434082, + "logits/rejected": -1.2675760984420776, + "logps/chosen": -507.2261657714844, + "logps/rejected": -582.9666748046875, + "loss": 0.5307, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3146495819091797, + "rewards/margins": 1.0160276889801025, + "rewards/rejected": -3.330677032470703, + "step": 1950 + }, + { + "epoch": 0.5129547238942685, + "grad_norm": 9.132381439208984, + "learning_rate": 2.823484120195865e-06, + "logits/chosen": -1.4892734289169312, + "logits/rejected": -1.310719609260559, + "logps/chosen": -494.809814453125, + "logps/rejected": -583.18798828125, + "loss": 0.4256, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.1708030700683594, + "rewards/margins": 1.2277300357818604, + "rewards/rejected": -3.398533582687378, + "step": 1960 + }, + { + "epoch": 0.5155718398325045, + "grad_norm": 13.682202339172363, + "learning_rate": 2.8008183540801486e-06, + "logits/chosen": -1.2465951442718506, + "logits/rejected": -1.113993525505066, + "logps/chosen": -583.8721313476562, + "logps/rejected": -649.005126953125, + "loss": 0.5177, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.103384494781494, + "rewards/margins": 1.1054545640945435, + "rewards/rejected": -4.208839416503906, + "step": 1970 + }, + { + "epoch": 0.5181889557707406, + "grad_norm": 13.151285171508789, + "learning_rate": 2.7781274697184353e-06, + "logits/chosen": -1.1965068578720093, + "logits/rejected": -1.2104531526565552, + "logps/chosen": -526.9979248046875, + "logps/rejected": -667.6734619140625, + "loss": 0.5585, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9773271083831787, + "rewards/margins": 1.2037287950515747, + "rewards/rejected": -4.181056022644043, + "step": 1980 + }, + { + "epoch": 0.5208060717089767, + "grad_norm": 7.820137023925781, + "learning_rate": 2.7554133617930397e-06, + "logits/chosen": -1.4265800714492798, + "logits/rejected": -1.3070918321609497, + "logps/chosen": -457.6055603027344, + "logps/rejected": -541.1328735351562, + "loss": 0.5257, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.098328113555908, + "rewards/margins": 0.9863009452819824, + "rewards/rejected": -3.0846290588378906, + "step": 1990 + }, + { + "epoch": 0.5234231876472127, + "grad_norm": 12.656340599060059, + "learning_rate": 2.7326779269254363e-06, + "logits/chosen": -1.6775627136230469, + "logits/rejected": -1.5273171663284302, + "logps/chosen": -465.67138671875, + "logps/rejected": -513.0640869140625, + "loss": 0.4561, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7640680074691772, + "rewards/margins": 1.0338157415390015, + "rewards/rejected": -2.797883987426758, + "step": 2000 + }, + { + "epoch": 0.5234231876472127, + "eval_logits/chosen": -1.4695755243301392, + "eval_logits/rejected": -1.3557151556015015, + "eval_logps/chosen": -461.283935546875, + "eval_logps/rejected": -540.4462280273438, + "eval_loss": 0.4929336607456207, + "eval_rewards/accuracies": 0.7419999837875366, + "eval_rewards/chosen": -1.9668195247650146, + "eval_rewards/margins": 0.9919391870498657, + "eval_rewards/rejected": -2.95875883102417, + "eval_runtime": 1598.5788, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.156, + "step": 2000 + }, + { + "epoch": 0.5260403035854488, + "grad_norm": 10.136625289916992, + "learning_rate": 2.7099230635178954e-06, + "logits/chosen": -1.506194829940796, + "logits/rejected": -1.4691191911697388, + "logps/chosen": -475.35992431640625, + "logps/rejected": -562.2814331054688, + "loss": 0.5025, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.106142520904541, + "rewards/margins": 0.9343917965888977, + "rewards/rejected": -3.040534257888794, + "step": 2010 + }, + { + "epoch": 0.528657419523685, + "grad_norm": 10.457864761352539, + "learning_rate": 2.6871506715949608e-06, + "logits/chosen": -1.5337005853652954, + "logits/rejected": -1.3907279968261719, + "logps/chosen": -485.59637451171875, + "logps/rejected": -580.8443603515625, + "loss": 0.4664, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.3269832134246826, + "rewards/margins": 1.1330835819244385, + "rewards/rejected": -3.4600670337677, + "step": 2020 + }, + { + "epoch": 0.5312745354619209, + "grad_norm": 7.218365669250488, + "learning_rate": 2.6643626526448063e-06, + "logits/chosen": -1.4807672500610352, + "logits/rejected": -1.308292031288147, + "logps/chosen": -573.4922485351562, + "logps/rejected": -676.8171997070312, + "loss": 0.4288, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.787353038787842, + "rewards/margins": 1.456699252128601, + "rewards/rejected": -4.244051933288574, + "step": 2030 + }, + { + "epoch": 0.533891651400157, + "grad_norm": 14.370759010314941, + "learning_rate": 2.6415609094604562e-06, + "logits/chosen": -1.194697380065918, + "logits/rejected": -1.1315624713897705, + "logps/chosen": -666.68896484375, + "logps/rejected": -775.71240234375, + "loss": 0.5124, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.9625601768493652, + "rewards/margins": 1.2808544635772705, + "rewards/rejected": -5.243414878845215, + "step": 2040 + }, + { + "epoch": 0.5365087673383931, + "grad_norm": 12.651047706604004, + "learning_rate": 2.618747345980904e-06, + "logits/chosen": -1.1674007177352905, + "logits/rejected": -0.9596608877182007, + "logps/chosen": -729.123291015625, + "logps/rejected": -793.8466796875, + "loss": 0.547, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.807937145233154, + "rewards/margins": 1.1525070667266846, + "rewards/rejected": -5.960444450378418, + "step": 2050 + }, + { + "epoch": 0.5391258832766291, + "grad_norm": 6.089327335357666, + "learning_rate": 2.595923867132136e-06, + "logits/chosen": -1.2956593036651611, + "logits/rejected": -1.1621012687683105, + "logps/chosen": -649.8438720703125, + "logps/rejected": -758.2276000976562, + "loss": 0.4846, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.689380645751953, + "rewards/margins": 1.3314837217330933, + "rewards/rejected": -5.020864009857178, + "step": 2060 + }, + { + "epoch": 0.5417429992148652, + "grad_norm": 7.40191650390625, + "learning_rate": 2.5730923786680672e-06, + "logits/chosen": -1.2692714929580688, + "logits/rejected": -1.248396635055542, + "logps/chosen": -549.1704711914062, + "logps/rejected": -643.1365966796875, + "loss": 0.5449, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.976423740386963, + "rewards/margins": 0.864033579826355, + "rewards/rejected": -3.8404572010040283, + "step": 2070 + }, + { + "epoch": 0.5443601151531012, + "grad_norm": 9.540526390075684, + "learning_rate": 2.5502547870114137e-06, + "logits/chosen": -1.3864257335662842, + "logits/rejected": -1.2374814748764038, + "logps/chosen": -515.8956909179688, + "logps/rejected": -582.565185546875, + "loss": 0.5054, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.6055984497070312, + "rewards/margins": 0.9696043729782104, + "rewards/rejected": -3.5752029418945312, + "step": 2080 + }, + { + "epoch": 0.5469772310913373, + "grad_norm": 11.426345825195312, + "learning_rate": 2.527412999094507e-06, + "logits/chosen": -1.3803369998931885, + "logits/rejected": -1.2163236141204834, + "logps/chosen": -550.672607421875, + "logps/rejected": -663.0771484375, + "loss": 0.4469, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5704407691955566, + "rewards/margins": 1.2492586374282837, + "rewards/rejected": -3.81969952583313, + "step": 2090 + }, + { + "epoch": 0.5495943470295734, + "grad_norm": 10.400588035583496, + "learning_rate": 2.504568922200064e-06, + "logits/chosen": -1.3159221410751343, + "logits/rejected": -1.1753368377685547, + "logps/chosen": -523.4752197265625, + "logps/rejected": -638.3160400390625, + "loss": 0.5068, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8226757049560547, + "rewards/margins": 1.3093383312225342, + "rewards/rejected": -4.132014274597168, + "step": 2100 + }, + { + "epoch": 0.5495943470295734, + "eval_logits/chosen": -1.2426347732543945, + "eval_logits/rejected": -1.1150033473968506, + "eval_logps/chosen": -579.1231079101562, + "eval_logps/rejected": -680.8953857421875, + "eval_loss": 0.49687275290489197, + "eval_rewards/accuracies": 0.7350000143051147, + "eval_rewards/chosen": -3.145211935043335, + "eval_rewards/margins": 1.2180382013320923, + "eval_rewards/rejected": -4.363250255584717, + "eval_runtime": 1598.2336, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.156, + "step": 2100 + }, + { + "epoch": 0.5522114629678094, + "grad_norm": 13.339751243591309, + "learning_rate": 2.4817244638019333e-06, + "logits/chosen": -1.3830143213272095, + "logits/rejected": -1.2277696132659912, + "logps/chosen": -579.9473876953125, + "logps/rejected": -659.97412109375, + "loss": 0.5192, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.041943073272705, + "rewards/margins": 1.2124286890029907, + "rewards/rejected": -4.254371643066406, + "step": 2110 + }, + { + "epoch": 0.5548285789060455, + "grad_norm": 10.70681095123291, + "learning_rate": 2.4588815314058155e-06, + "logits/chosen": -1.3805171251296997, + "logits/rejected": -1.3095520734786987, + "logps/chosen": -527.185302734375, + "logps/rejected": -602.2315673828125, + "loss": 0.4666, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8171143531799316, + "rewards/margins": 1.0927484035491943, + "rewards/rejected": -3.909862518310547, + "step": 2120 + }, + { + "epoch": 0.5574456948442816, + "grad_norm": 13.302124977111816, + "learning_rate": 2.4360420323899922e-06, + "logits/chosen": -1.4553115367889404, + "logits/rejected": -1.3466088771820068, + "logps/chosen": -556.26611328125, + "logps/rejected": -608.5457763671875, + "loss": 0.5919, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8050436973571777, + "rewards/margins": 0.8486245274543762, + "rewards/rejected": -3.653668165206909, + "step": 2130 + }, + { + "epoch": 0.5600628107825176, + "grad_norm": 9.734771728515625, + "learning_rate": 2.4132078738460585e-06, + "logits/chosen": -1.5308691263198853, + "logits/rejected": -1.3970110416412354, + "logps/chosen": -516.7899169921875, + "logps/rejected": -586.8917236328125, + "loss": 0.4794, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.5241827964782715, + "rewards/margins": 1.0863929986953735, + "rewards/rejected": -3.6105754375457764, + "step": 2140 + }, + { + "epoch": 0.5626799267207537, + "grad_norm": 16.153608322143555, + "learning_rate": 2.3903809624196826e-06, + "logits/chosen": -1.4662330150604248, + "logits/rejected": -1.3525559902191162, + "logps/chosen": -481.68035888671875, + "logps/rejected": -539.03515625, + "loss": 0.5497, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.4799540042877197, + "rewards/margins": 0.8992452621459961, + "rewards/rejected": -3.379199266433716, + "step": 2150 + }, + { + "epoch": 0.5652970426589898, + "grad_norm": 13.256193161010742, + "learning_rate": 2.3675632041513978e-06, + "logits/chosen": -1.5737719535827637, + "logits/rejected": -1.3470098972320557, + "logps/chosen": -527.2965698242188, + "logps/rejected": -575.4271240234375, + "loss": 0.4882, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.403984546661377, + "rewards/margins": 1.1281957626342773, + "rewards/rejected": -3.532180070877075, + "step": 2160 + }, + { + "epoch": 0.5679141585972258, + "grad_norm": 13.135820388793945, + "learning_rate": 2.3447565043174533e-06, + "logits/chosen": -1.4236756563186646, + "logits/rejected": -1.263068437576294, + "logps/chosen": -507.3004455566406, + "logps/rejected": -562.6768798828125, + "loss": 0.5195, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.490262746810913, + "rewards/margins": 0.9420592188835144, + "rewards/rejected": -3.4323222637176514, + "step": 2170 + }, + { + "epoch": 0.5705312745354619, + "grad_norm": 11.776784896850586, + "learning_rate": 2.321962767270724e-06, + "logits/chosen": -1.4430488348007202, + "logits/rejected": -1.3176742792129517, + "logps/chosen": -504.53009033203125, + "logps/rejected": -551.927490234375, + "loss": 0.5579, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.546835422515869, + "rewards/margins": 0.8456700444221497, + "rewards/rejected": -3.392504930496216, + "step": 2180 + }, + { + "epoch": 0.573148390473698, + "grad_norm": 8.283063888549805, + "learning_rate": 2.299183896281692e-06, + "logits/chosen": -1.4085118770599365, + "logits/rejected": -1.3014566898345947, + "logps/chosen": -492.5560607910156, + "logps/rejected": -586.7115478515625, + "loss": 0.5087, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.376844882965088, + "rewards/margins": 0.9645326733589172, + "rewards/rejected": -3.3413777351379395, + "step": 2190 + }, + { + "epoch": 0.575765506411934, + "grad_norm": 6.720417022705078, + "learning_rate": 2.2764217933795297e-06, + "logits/chosen": -1.529714822769165, + "logits/rejected": -1.415728211402893, + "logps/chosen": -480.97955322265625, + "logps/rejected": -568.6060791015625, + "loss": 0.4839, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.196598529815674, + "rewards/margins": 1.081301212310791, + "rewards/rejected": -3.2778995037078857, + "step": 2200 + }, + { + "epoch": 0.575765506411934, + "eval_logits/chosen": -1.3886340856552124, + "eval_logits/rejected": -1.2705532312393188, + "eval_logps/chosen": -502.5681457519531, + "eval_logps/rejected": -588.3314819335938, + "eval_loss": 0.49266016483306885, + "eval_rewards/accuracies": 0.7404999732971191, + "eval_rewards/chosen": -2.379661798477173, + "eval_rewards/margins": 1.0579497814178467, + "eval_rewards/rejected": -3.4376115798950195, + "eval_runtime": 1597.9898, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.156, + "step": 2200 + }, + { + "epoch": 0.5783826223501701, + "grad_norm": 5.171483039855957, + "learning_rate": 2.2536783591932786e-06, + "logits/chosen": -1.5503586530685425, + "logits/rejected": -1.394595742225647, + "logps/chosen": -533.165771484375, + "logps/rejected": -615.0496826171875, + "loss": 0.4996, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.578037738800049, + "rewards/margins": 1.0000585317611694, + "rewards/rejected": -3.5780959129333496, + "step": 2210 + }, + { + "epoch": 0.5809997382884062, + "grad_norm": 8.2252836227417, + "learning_rate": 2.230955492793149e-06, + "logits/chosen": -1.3457584381103516, + "logits/rejected": -1.2957208156585693, + "logps/chosen": -549.8411865234375, + "logps/rejected": -641.9370727539062, + "loss": 0.5669, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.760831356048584, + "rewards/margins": 1.0501973628997803, + "rewards/rejected": -3.811028242111206, + "step": 2220 + }, + { + "epoch": 0.5836168542266422, + "grad_norm": 8.77107048034668, + "learning_rate": 2.208255091531947e-06, + "logits/chosen": -1.3714348077774048, + "logits/rejected": -1.294524908065796, + "logps/chosen": -516.0252685546875, + "logps/rejected": -604.4722900390625, + "loss": 0.4741, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.39119291305542, + "rewards/margins": 1.2340974807739258, + "rewards/rejected": -3.6252903938293457, + "step": 2230 + }, + { + "epoch": 0.5862339701648783, + "grad_norm": 13.218793869018555, + "learning_rate": 2.1855790508866435e-06, + "logits/chosen": -1.4534399509429932, + "logits/rejected": -1.355835199356079, + "logps/chosen": -526.3766479492188, + "logps/rejected": -626.1098022460938, + "loss": 0.5044, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2947020530700684, + "rewards/margins": 1.1731860637664795, + "rewards/rejected": -3.467888355255127, + "step": 2240 + }, + { + "epoch": 0.5888510861031143, + "grad_norm": 7.438356876373291, + "learning_rate": 2.162929264300107e-06, + "logits/chosen": -1.4775898456573486, + "logits/rejected": -1.375610113143921, + "logps/chosen": -440.20208740234375, + "logps/rejected": -556.2523193359375, + "loss": 0.3962, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.712977647781372, + "rewards/margins": 1.3840254545211792, + "rewards/rejected": -3.0970029830932617, + "step": 2250 + }, + { + "epoch": 0.5914682020413504, + "grad_norm": 13.876564025878906, + "learning_rate": 2.1403076230230006e-06, + "logits/chosen": -1.4080345630645752, + "logits/rejected": -1.2920736074447632, + "logps/chosen": -475.05548095703125, + "logps/rejected": -532.099365234375, + "loss": 0.5823, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.039393901824951, + "rewards/margins": 0.848638653755188, + "rewards/rejected": -2.8880326747894287, + "step": 2260 + }, + { + "epoch": 0.5940853179795865, + "grad_norm": 10.589466094970703, + "learning_rate": 2.11771601595586e-06, + "logits/chosen": -1.447265386581421, + "logits/rejected": -1.3221790790557861, + "logps/chosen": -478.97149658203125, + "logps/rejected": -525.7181396484375, + "loss": 0.5126, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9702937602996826, + "rewards/margins": 0.9898223876953125, + "rewards/rejected": -2.960115909576416, + "step": 2270 + }, + { + "epoch": 0.5967024339178225, + "grad_norm": 13.875945091247559, + "learning_rate": 2.0951563294913737e-06, + "logits/chosen": -1.3917208909988403, + "logits/rejected": -1.2089694738388062, + "logps/chosen": -480.66595458984375, + "logps/rejected": -556.7701416015625, + "loss": 0.462, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.19204044342041, + "rewards/margins": 1.0662552118301392, + "rewards/rejected": -3.2582955360412598, + "step": 2280 + }, + { + "epoch": 0.5993195498560586, + "grad_norm": 9.446721076965332, + "learning_rate": 2.0726304473568693e-06, + "logits/chosen": -1.3553434610366821, + "logits/rejected": -1.238797903060913, + "logps/chosen": -493.4701232910156, + "logps/rejected": -565.8324584960938, + "loss": 0.4824, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3467798233032227, + "rewards/margins": 1.046331524848938, + "rewards/rejected": -3.393110990524292, + "step": 2290 + }, + { + "epoch": 0.6019366657942947, + "grad_norm": 9.440372467041016, + "learning_rate": 2.050140250457023e-06, + "logits/chosen": -1.4484026432037354, + "logits/rejected": -1.2413018941879272, + "logps/chosen": -548.2637939453125, + "logps/rejected": -634.7371826171875, + "loss": 0.4729, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7083754539489746, + "rewards/margins": 1.2052128314971924, + "rewards/rejected": -3.913588285446167, + "step": 2300 + }, + { + "epoch": 0.6019366657942947, + "eval_logits/chosen": -1.2145209312438965, + "eval_logits/rejected": -1.0868196487426758, + "eval_logps/chosen": -549.21240234375, + "eval_logps/rejected": -656.666748046875, + "eval_loss": 0.49239280819892883, + "eval_rewards/accuracies": 0.7404999732971191, + "eval_rewards/chosen": -2.846104383468628, + "eval_rewards/margins": 1.2748597860336304, + "eval_rewards/rejected": -4.120964050292969, + "eval_runtime": 1598.0392, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.156, + "step": 2300 + }, + { + "epoch": 0.6045537817325307, + "grad_norm": 15.882525444030762, + "learning_rate": 2.0276876167168042e-06, + "logits/chosen": -1.1735626459121704, + "logits/rejected": -1.07330322265625, + "logps/chosen": -511.028564453125, + "logps/rejected": -598.4305419921875, + "loss": 0.5852, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8660171031951904, + "rewards/margins": 1.2002947330474854, + "rewards/rejected": -4.066311836242676, + "step": 2310 + }, + { + "epoch": 0.6071708976707668, + "grad_norm": 8.326827049255371, + "learning_rate": 2.0052744209246682e-06, + "logits/chosen": -1.3162415027618408, + "logits/rejected": -1.1979725360870361, + "logps/chosen": -522.6014404296875, + "logps/rejected": -590.3125610351562, + "loss": 0.5189, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.673841714859009, + "rewards/margins": 1.005782127380371, + "rewards/rejected": -3.6796233654022217, + "step": 2320 + }, + { + "epoch": 0.6097880136090029, + "grad_norm": 14.694316864013672, + "learning_rate": 1.9829025345760127e-06, + "logits/chosen": -1.321624994277954, + "logits/rejected": -1.2669956684112549, + "logps/chosen": -527.9610595703125, + "logps/rejected": -604.3615112304688, + "loss": 0.5587, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.490408420562744, + "rewards/margins": 0.8247787356376648, + "rewards/rejected": -3.315187454223633, + "step": 2330 + }, + { + "epoch": 0.6124051295472389, + "grad_norm": 10.311464309692383, + "learning_rate": 1.9605738257169115e-06, + "logits/chosen": -1.270620346069336, + "logits/rejected": -1.120276927947998, + "logps/chosen": -485.13226318359375, + "logps/rejected": -588.9454956054688, + "loss": 0.4972, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5224430561065674, + "rewards/margins": 1.1178371906280518, + "rewards/rejected": -3.6402804851531982, + "step": 2340 + }, + { + "epoch": 0.615022245485475, + "grad_norm": 10.49399471282959, + "learning_rate": 1.9382901587881275e-06, + "logits/chosen": -1.304638147354126, + "logits/rejected": -1.1775546073913574, + "logps/chosen": -500.91339111328125, + "logps/rejected": -595.765625, + "loss": 0.4158, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.42271089553833, + "rewards/margins": 1.271071195602417, + "rewards/rejected": -3.693782329559326, + "step": 2350 + }, + { + "epoch": 0.6176393614237111, + "grad_norm": 10.7622652053833, + "learning_rate": 1.916053394469437e-06, + "logits/chosen": -1.3254437446594238, + "logits/rejected": -1.1220028400421143, + "logps/chosen": -544.5164184570312, + "logps/rejected": -650.4244384765625, + "loss": 0.5071, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.812866687774658, + "rewards/margins": 1.2050769329071045, + "rewards/rejected": -4.017943382263184, + "step": 2360 + }, + { + "epoch": 0.6202564773619471, + "grad_norm": 9.485381126403809, + "learning_rate": 1.8938653895242604e-06, + "logits/chosen": -1.2691552639007568, + "logits/rejected": -1.0826399326324463, + "logps/chosen": -547.7079467773438, + "logps/rejected": -651.6534423828125, + "loss": 0.4229, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.8349509239196777, + "rewards/margins": 1.3276488780975342, + "rewards/rejected": -4.162599563598633, + "step": 2370 + }, + { + "epoch": 0.6228735933001832, + "grad_norm": 13.950716018676758, + "learning_rate": 1.8717279966446267e-06, + "logits/chosen": -1.116172432899475, + "logits/rejected": -1.0301882028579712, + "logps/chosen": -559.43896484375, + "logps/rejected": -674.9713134765625, + "loss": 0.4381, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.109766721725464, + "rewards/margins": 1.2503349781036377, + "rewards/rejected": -4.360101699829102, + "step": 2380 + }, + { + "epoch": 0.6254907092384192, + "grad_norm": 6.874395370483398, + "learning_rate": 1.8496430642964698e-06, + "logits/chosen": -1.1902521848678589, + "logits/rejected": -1.0676952600479126, + "logps/chosen": -581.3973999023438, + "logps/rejected": -685.8600463867188, + "loss": 0.4878, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.091068983078003, + "rewards/margins": 1.280792474746704, + "rewards/rejected": -4.371861457824707, + "step": 2390 + }, + { + "epoch": 0.6281078251766553, + "grad_norm": 8.19883918762207, + "learning_rate": 1.827612436565286e-06, + "logits/chosen": -1.2159771919250488, + "logits/rejected": -1.0594358444213867, + "logps/chosen": -552.8912353515625, + "logps/rejected": -668.69921875, + "loss": 0.4501, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.900498390197754, + "rewards/margins": 1.363520622253418, + "rewards/rejected": -4.264018535614014, + "step": 2400 + }, + { + "epoch": 0.6281078251766553, + "eval_logits/chosen": -1.125669240951538, + "eval_logits/rejected": -0.9978408813476562, + "eval_logps/chosen": -562.0332641601562, + "eval_logps/rejected": -668.2345581054688, + "eval_loss": 0.489955335855484, + "eval_rewards/accuracies": 0.7429999709129333, + "eval_rewards/chosen": -2.974313735961914, + "eval_rewards/margins": 1.2623279094696045, + "eval_rewards/rejected": -4.236640930175781, + "eval_runtime": 1597.7712, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.156, + "step": 2400 + }, + { + "epoch": 0.6307249411148914, + "grad_norm": 17.752099990844727, + "learning_rate": 1.8056379530021492e-06, + "logits/chosen": -1.2881437540054321, + "logits/rejected": -1.1982684135437012, + "logps/chosen": -541.344482421875, + "logps/rejected": -619.2030029296875, + "loss": 0.5127, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.9722681045532227, + "rewards/margins": 1.0265535116195679, + "rewards/rejected": -3.998821258544922, + "step": 2410 + }, + { + "epoch": 0.6333420570531274, + "grad_norm": 8.487099647521973, + "learning_rate": 1.7837214484701154e-06, + "logits/chosen": -1.339787244796753, + "logits/rejected": -1.2054253816604614, + "logps/chosen": -510.6474609375, + "logps/rejected": -616.0956420898438, + "loss": 0.4786, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.5842533111572266, + "rewards/margins": 1.2878291606903076, + "rewards/rejected": -3.872082233428955, + "step": 2420 + }, + { + "epoch": 0.6359591729913635, + "grad_norm": 14.060596466064453, + "learning_rate": 1.7618647529910043e-06, + "logits/chosen": -1.3402836322784424, + "logits/rejected": -1.2104113101959229, + "logps/chosen": -508.7401428222656, + "logps/rejected": -625.2578735351562, + "loss": 0.4786, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.520415782928467, + "rewards/margins": 1.2837390899658203, + "rewards/rejected": -3.804154872894287, + "step": 2430 + }, + { + "epoch": 0.6385762889295996, + "grad_norm": 9.59952449798584, + "learning_rate": 1.7400696915925996e-06, + "logits/chosen": -1.3208459615707397, + "logits/rejected": -1.141103982925415, + "logps/chosen": -542.9054565429688, + "logps/rejected": -597.6497802734375, + "loss": 0.5332, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.7234339714050293, + "rewards/margins": 1.1302305459976196, + "rewards/rejected": -3.853663921356201, + "step": 2440 + }, + { + "epoch": 0.6411934048678356, + "grad_norm": 12.0162992477417, + "learning_rate": 1.718338084156254e-06, + "logits/chosen": -1.309754490852356, + "logits/rejected": -1.1585099697113037, + "logps/chosen": -527.2728881835938, + "logps/rejected": -612.1050415039062, + "loss": 0.4561, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.4056506156921387, + "rewards/margins": 1.2328300476074219, + "rewards/rejected": -3.6384806632995605, + "step": 2450 + }, + { + "epoch": 0.6438105208060717, + "grad_norm": 15.278785705566406, + "learning_rate": 1.6966717452649372e-06, + "logits/chosen": -1.446975588798523, + "logits/rejected": -1.282833456993103, + "logps/chosen": -502.6387634277344, + "logps/rejected": -571.8133544921875, + "loss": 0.4497, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2578883171081543, + "rewards/margins": 1.2107127904891968, + "rewards/rejected": -3.4686012268066406, + "step": 2460 + }, + { + "epoch": 0.6464276367443078, + "grad_norm": 8.051189422607422, + "learning_rate": 1.6750724840517103e-06, + "logits/chosen": -1.3973640203475952, + "logits/rejected": -1.3273109197616577, + "logps/chosen": -461.38665771484375, + "logps/rejected": -567.72509765625, + "loss": 0.5096, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.086717128753662, + "rewards/margins": 1.0224206447601318, + "rewards/rejected": -3.109138011932373, + "step": 2470 + }, + { + "epoch": 0.6490447526825438, + "grad_norm": 11.723711967468262, + "learning_rate": 1.6535421040486686e-06, + "logits/chosen": -1.2049771547317505, + "logits/rejected": -1.1100887060165405, + "logps/chosen": -472.3949279785156, + "logps/rejected": -566.0884399414062, + "loss": 0.4321, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.191418409347534, + "rewards/margins": 1.2799413204193115, + "rewards/rejected": -3.4713597297668457, + "step": 2480 + }, + { + "epoch": 0.6516618686207799, + "grad_norm": 18.185462951660156, + "learning_rate": 1.6320824030363458e-06, + "logits/chosen": -1.2619669437408447, + "logits/rejected": -1.2079191207885742, + "logps/chosen": -466.91845703125, + "logps/rejected": -579.8604125976562, + "loss": 0.4609, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.3208956718444824, + "rewards/margins": 1.2844674587249756, + "rewards/rejected": -3.605363130569458, + "step": 2490 + }, + { + "epoch": 0.654278984559016, + "grad_norm": 14.226390838623047, + "learning_rate": 1.6106951728936028e-06, + "logits/chosen": -1.3449714183807373, + "logits/rejected": -1.212172508239746, + "logps/chosen": -500.9017028808594, + "logps/rejected": -603.9783325195312, + "loss": 0.4982, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.4372124671936035, + "rewards/margins": 1.1106388568878174, + "rewards/rejected": -3.547851085662842, + "step": 2500 + }, + { + "epoch": 0.654278984559016, + "eval_logits/chosen": -1.1862049102783203, + "eval_logits/rejected": -1.0531891584396362, + "eval_logps/chosen": -510.45111083984375, + "eval_logps/rejected": -612.1486206054688, + "eval_loss": 0.4872073829174042, + "eval_rewards/accuracies": 0.7419999837875366, + "eval_rewards/chosen": -2.4584920406341553, + "eval_rewards/margins": 1.2172898054122925, + "eval_rewards/rejected": -3.675781488418579, + "eval_runtime": 1597.3321, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 2500 + }, + { + "epoch": 0.656896100497252, + "grad_norm": 10.693954467773438, + "learning_rate": 1.5893821994479996e-06, + "logits/chosen": -1.3317979574203491, + "logits/rejected": -1.2107937335968018, + "logps/chosen": -518.4515380859375, + "logps/rejected": -599.1994018554688, + "loss": 0.479, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.410822629928589, + "rewards/margins": 1.1930694580078125, + "rewards/rejected": -3.6038920879364014, + "step": 2510 + }, + { + "epoch": 0.6595132164354881, + "grad_norm": 10.660543441772461, + "learning_rate": 1.5681452623266868e-06, + "logits/chosen": -1.3140581846237183, + "logits/rejected": -1.059616208076477, + "logps/chosen": -549.7576293945312, + "logps/rejected": -629.7240600585938, + "loss": 0.4838, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.60799503326416, + "rewards/margins": 1.3822691440582275, + "rewards/rejected": -3.9902641773223877, + "step": 2520 + }, + { + "epoch": 0.6621303323737242, + "grad_norm": 9.327082633972168, + "learning_rate": 1.5469861348078014e-06, + "logits/chosen": -1.315731167793274, + "logits/rejected": -1.1592333316802979, + "logps/chosen": -516.8220825195312, + "logps/rejected": -651.5704956054688, + "loss": 0.4178, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7009530067443848, + "rewards/margins": 1.4539804458618164, + "rewards/rejected": -4.154933929443359, + "step": 2530 + }, + { + "epoch": 0.6647474483119602, + "grad_norm": 14.643211364746094, + "learning_rate": 1.5259065836724035e-06, + "logits/chosen": -1.1933174133300781, + "logits/rejected": -1.119011402130127, + "logps/chosen": -520.9530639648438, + "logps/rejected": -660.1638793945312, + "loss": 0.4109, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.762756824493408, + "rewards/margins": 1.4529297351837158, + "rewards/rejected": -4.215685844421387, + "step": 2540 + }, + { + "epoch": 0.6673645642501963, + "grad_norm": 23.931337356567383, + "learning_rate": 1.5049083690569456e-06, + "logits/chosen": -1.2723230123519897, + "logits/rejected": -1.1774482727050781, + "logps/chosen": -514.2723999023438, + "logps/rejected": -640.9010009765625, + "loss": 0.5081, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.768399477005005, + "rewards/margins": 1.3103950023651123, + "rewards/rejected": -4.078794002532959, + "step": 2550 + }, + { + "epoch": 0.6699816801884323, + "grad_norm": 10.859006881713867, + "learning_rate": 1.4839932443063057e-06, + "logits/chosen": -1.286709189414978, + "logits/rejected": -1.130676507949829, + "logps/chosen": -553.838134765625, + "logps/rejected": -631.344482421875, + "loss": 0.459, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.619520425796509, + "rewards/margins": 1.3386470079421997, + "rewards/rejected": -3.958167314529419, + "step": 2560 + }, + { + "epoch": 0.6725987961266684, + "grad_norm": 14.106218338012695, + "learning_rate": 1.4631629558273803e-06, + "logits/chosen": -1.3286449909210205, + "logits/rejected": -1.205742597579956, + "logps/chosen": -493.1719665527344, + "logps/rejected": -586.0244140625, + "loss": 0.5866, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.515252113342285, + "rewards/margins": 1.044698715209961, + "rewards/rejected": -3.559950590133667, + "step": 2570 + }, + { + "epoch": 0.6752159120649045, + "grad_norm": 6.197361469268799, + "learning_rate": 1.4424192429432657e-06, + "logits/chosen": -1.4131492376327515, + "logits/rejected": -1.3109676837921143, + "logps/chosen": -469.304931640625, + "logps/rejected": -598.4210815429688, + "loss": 0.4614, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.13527250289917, + "rewards/margins": 1.2581441402435303, + "rewards/rejected": -3.3934166431427, + "step": 2580 + }, + { + "epoch": 0.6778330280031405, + "grad_norm": 13.008953094482422, + "learning_rate": 1.421763837748016e-06, + "logits/chosen": -1.3615756034851074, + "logits/rejected": -1.268027901649475, + "logps/chosen": -481.23358154296875, + "logps/rejected": -605.3997802734375, + "loss": 0.4241, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.3075268268585205, + "rewards/margins": 1.3260997533798218, + "rewards/rejected": -3.6336264610290527, + "step": 2590 + }, + { + "epoch": 0.6804501439413766, + "grad_norm": 10.1073637008667, + "learning_rate": 1.401198464962021e-06, + "logits/chosen": -1.3610389232635498, + "logits/rejected": -1.198563814163208, + "logps/chosen": -530.4500732421875, + "logps/rejected": -605.023681640625, + "loss": 0.4649, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6202147006988525, + "rewards/margins": 1.1309864521026611, + "rewards/rejected": -3.7512009143829346, + "step": 2600 + }, + { + "epoch": 0.6804501439413766, + "eval_logits/chosen": -1.2114638090133667, + "eval_logits/rejected": -1.079287052154541, + "eval_logps/chosen": -522.1907958984375, + "eval_logps/rejected": -632.8793334960938, + "eval_loss": 0.48811665177345276, + "eval_rewards/accuracies": 0.7450000047683716, + "eval_rewards/chosen": -2.5758883953094482, + "eval_rewards/margins": 1.307201623916626, + "eval_rewards/rejected": -3.883090019226074, + "eval_runtime": 1597.3459, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 2600 + }, + { + "epoch": 0.6830672598796127, + "grad_norm": 9.867508888244629, + "learning_rate": 1.3807248417879896e-06, + "logits/chosen": -1.401888132095337, + "logits/rejected": -1.276673436164856, + "logps/chosen": -532.1458740234375, + "logps/rejected": -652.4072265625, + "loss": 0.4363, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.591766357421875, + "rewards/margins": 1.404159426689148, + "rewards/rejected": -3.9959254264831543, + "step": 2610 + }, + { + "epoch": 0.6856843758178487, + "grad_norm": 28.170438766479492, + "learning_rate": 1.3603446777675665e-06, + "logits/chosen": -1.209559440612793, + "logits/rejected": -1.0783087015151978, + "logps/chosen": -564.7611694335938, + "logps/rejected": -670.5452270507812, + "loss": 0.5678, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.0164246559143066, + "rewards/margins": 1.2787444591522217, + "rewards/rejected": -4.295169353485107, + "step": 2620 + }, + { + "epoch": 0.6883014917560848, + "grad_norm": 9.69133472442627, + "learning_rate": 1.3400596746385817e-06, + "logits/chosen": -1.3486610651016235, + "logits/rejected": -1.1891324520111084, + "logps/chosen": -538.7862548828125, + "logps/rejected": -633.6087646484375, + "loss": 0.4968, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.6963882446289062, + "rewards/margins": 1.2195419073104858, + "rewards/rejected": -3.9159302711486816, + "step": 2630 + }, + { + "epoch": 0.6909186076943209, + "grad_norm": 11.340239524841309, + "learning_rate": 1.3198715261929587e-06, + "logits/chosen": -1.339261531829834, + "logits/rejected": -1.186591386795044, + "logps/chosen": -508.19189453125, + "logps/rejected": -621.9693603515625, + "loss": 0.4294, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.717308759689331, + "rewards/margins": 1.2740724086761475, + "rewards/rejected": -3.9913814067840576, + "step": 2640 + }, + { + "epoch": 0.6935357236325569, + "grad_norm": 8.626651763916016, + "learning_rate": 1.2997819181352823e-06, + "logits/chosen": -1.3360286951065063, + "logits/rejected": -1.1647727489471436, + "logps/chosen": -564.6326904296875, + "logps/rejected": -691.1988525390625, + "loss": 0.4326, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.678508996963501, + "rewards/margins": 1.511307954788208, + "rewards/rejected": -4.189816474914551, + "step": 2650 + }, + { + "epoch": 0.696152839570793, + "grad_norm": 21.902114868164062, + "learning_rate": 1.2797925279420454e-06, + "logits/chosen": -1.2936923503875732, + "logits/rejected": -1.1569340229034424, + "logps/chosen": -576.9588623046875, + "logps/rejected": -705.35791015625, + "loss": 0.4632, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0516982078552246, + "rewards/margins": 1.4037137031555176, + "rewards/rejected": -4.4554123878479, + "step": 2660 + }, + { + "epoch": 0.6987699555090291, + "grad_norm": 12.999093055725098, + "learning_rate": 1.2599050247215764e-06, + "logits/chosen": -1.2631757259368896, + "logits/rejected": -1.1437580585479736, + "logps/chosen": -550.2919921875, + "logps/rejected": -659.2994384765625, + "loss": 0.4911, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9243297576904297, + "rewards/margins": 1.3196732997894287, + "rewards/rejected": -4.244002819061279, + "step": 2670 + }, + { + "epoch": 0.7013870714472651, + "grad_norm": 15.106201171875, + "learning_rate": 1.2401210690746705e-06, + "logits/chosen": -1.2861721515655518, + "logits/rejected": -1.1344573497772217, + "logps/chosen": -562.5394287109375, + "logps/rejected": -647.7611083984375, + "loss": 0.5272, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9545741081237793, + "rewards/margins": 1.1845426559448242, + "rewards/rejected": -4.1391167640686035, + "step": 2680 + }, + { + "epoch": 0.7040041873855012, + "grad_norm": 9.087705612182617, + "learning_rate": 1.2204423129559306e-06, + "logits/chosen": -1.3246266841888428, + "logits/rejected": -1.2674678564071655, + "logps/chosen": -540.13037109375, + "logps/rejected": -657.595947265625, + "loss": 0.492, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7469382286071777, + "rewards/margins": 1.2162708044052124, + "rewards/rejected": -3.963209629058838, + "step": 2690 + }, + { + "epoch": 0.7066213033237373, + "grad_norm": 18.654743194580078, + "learning_rate": 1.20087039953583e-06, + "logits/chosen": -1.3834110498428345, + "logits/rejected": -1.2643133401870728, + "logps/chosen": -513.6646728515625, + "logps/rejected": -604.9307250976562, + "loss": 0.556, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.4955801963806152, + "rewards/margins": 1.1764256954193115, + "rewards/rejected": -3.6720058917999268, + "step": 2700 + }, + { + "epoch": 0.7066213033237373, + "eval_logits/chosen": -1.2295472621917725, + "eval_logits/rejected": -1.1003633737564087, + "eval_logps/chosen": -498.9264831542969, + "eval_logps/rejected": -595.6959228515625, + "eval_loss": 0.4841165840625763, + "eval_rewards/accuracies": 0.7459999918937683, + "eval_rewards/chosen": -2.343245029449463, + "eval_rewards/margins": 1.1680108308792114, + "eval_rewards/rejected": -3.5112557411193848, + "eval_runtime": 1597.0057, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 2700 + }, + { + "epoch": 0.7092384192619733, + "grad_norm": 19.703975677490234, + "learning_rate": 1.181406963063507e-06, + "logits/chosen": -1.3118432760238647, + "logits/rejected": -1.2396559715270996, + "logps/chosen": -480.10662841796875, + "logps/rejected": -590.1580200195312, + "loss": 0.4967, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1435210704803467, + "rewards/margins": 1.1070889234542847, + "rewards/rejected": -3.250609874725342, + "step": 2710 + }, + { + "epoch": 0.7118555352002094, + "grad_norm": 5.974539279937744, + "learning_rate": 1.1620536287303052e-06, + "logits/chosen": -1.4100855588912964, + "logits/rejected": -1.2877388000488281, + "logps/chosen": -500.7962951660156, + "logps/rejected": -562.1715087890625, + "loss": 0.5414, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.097153902053833, + "rewards/margins": 0.9155322909355164, + "rewards/rejected": -3.012686014175415, + "step": 2720 + }, + { + "epoch": 0.7144726511384454, + "grad_norm": 9.851229667663574, + "learning_rate": 1.1428120125340717e-06, + "logits/chosen": -1.3525193929672241, + "logits/rejected": -1.19950532913208, + "logps/chosen": -445.181884765625, + "logps/rejected": -556.6793212890625, + "loss": 0.3931, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.911560297012329, + "rewards/margins": 1.5294151306152344, + "rewards/rejected": -3.4409751892089844, + "step": 2730 + }, + { + "epoch": 0.7170897670766815, + "grad_norm": 9.149617195129395, + "learning_rate": 1.123683721144223e-06, + "logits/chosen": -1.3333719968795776, + "logits/rejected": -1.2258195877075195, + "logps/chosen": -494.1949157714844, + "logps/rejected": -600.24658203125, + "loss": 0.4166, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.106128692626953, + "rewards/margins": 1.385012149810791, + "rewards/rejected": -3.491140842437744, + "step": 2740 + }, + { + "epoch": 0.7197068830149176, + "grad_norm": 7.582621097564697, + "learning_rate": 1.1046703517675848e-06, + "logits/chosen": -1.3422666788101196, + "logits/rejected": -1.2579666376113892, + "logps/chosen": -470.9822692871094, + "logps/rejected": -587.7799072265625, + "loss": 0.5163, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.159165620803833, + "rewards/margins": 1.106806993484497, + "rewards/rejected": -3.26597261428833, + "step": 2750 + }, + { + "epoch": 0.7223239989531536, + "grad_norm": 12.026503562927246, + "learning_rate": 1.085773492015028e-06, + "logits/chosen": -1.3507201671600342, + "logits/rejected": -1.1788911819458008, + "logps/chosen": -452.8814392089844, + "logps/rejected": -557.76416015625, + "loss": 0.4254, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.065864324569702, + "rewards/margins": 1.3924720287322998, + "rewards/rejected": -3.458336353302002, + "step": 2760 + }, + { + "epoch": 0.7249411148913897, + "grad_norm": 16.423751831054688, + "learning_rate": 1.0669947197689034e-06, + "logits/chosen": -1.3033778667449951, + "logits/rejected": -1.1522514820098877, + "logps/chosen": -498.0997619628906, + "logps/rejected": -590.888427734375, + "loss": 0.4935, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2554616928100586, + "rewards/margins": 1.2149760723114014, + "rewards/rejected": -3.470437526702881, + "step": 2770 + }, + { + "epoch": 0.7275582308296258, + "grad_norm": 8.218092918395996, + "learning_rate": 1.048335603051291e-06, + "logits/chosen": -1.2994598150253296, + "logits/rejected": -1.1608821153640747, + "logps/chosen": -527.0745849609375, + "logps/rejected": -638.7930908203125, + "loss": 0.4518, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.3942160606384277, + "rewards/margins": 1.3985862731933594, + "rewards/rejected": -3.792802333831787, + "step": 2780 + }, + { + "epoch": 0.7301753467678618, + "grad_norm": 12.542786598205566, + "learning_rate": 1.0297976998930665e-06, + "logits/chosen": -1.2890033721923828, + "logits/rejected": -1.1566731929779053, + "logps/chosen": -490.1463928222656, + "logps/rejected": -611.6651611328125, + "loss": 0.4472, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.296391010284424, + "rewards/margins": 1.492492437362671, + "rewards/rejected": -3.7888832092285156, + "step": 2790 + }, + { + "epoch": 0.7327924627060979, + "grad_norm": 10.247793197631836, + "learning_rate": 1.0113825582038078e-06, + "logits/chosen": -1.329484224319458, + "logits/rejected": -1.2037220001220703, + "logps/chosen": -503.18951416015625, + "logps/rejected": -605.0428466796875, + "loss": 0.4617, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.380659580230713, + "rewards/margins": 1.1856248378753662, + "rewards/rejected": -3.5662841796875, + "step": 2800 + }, + { + "epoch": 0.7327924627060979, + "eval_logits/chosen": -1.195982813835144, + "eval_logits/rejected": -1.0626633167266846, + "eval_logps/chosen": -499.5495910644531, + "eval_logps/rejected": -606.4032592773438, + "eval_loss": 0.4832090735435486, + "eval_rewards/accuracies": 0.7459999918937683, + "eval_rewards/chosen": -2.3494763374328613, + "eval_rewards/margins": 1.2688524723052979, + "eval_rewards/rejected": -3.61832857131958, + "eval_runtime": 1596.6931, + "eval_samples_per_second": 1.253, + "eval_steps_per_second": 0.157, + "step": 2800 + }, + { + "epoch": 0.735409578644334, + "grad_norm": 20.173498153686523, + "learning_rate": 9.930917156425477e-07, + "logits/chosen": -1.2981245517730713, + "logits/rejected": -1.1898633241653442, + "logps/chosen": -514.4564819335938, + "logps/rejected": -633.3460693359375, + "loss": 0.5311, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5339112281799316, + "rewards/margins": 1.2178757190704346, + "rewards/rejected": -3.751786708831787, + "step": 2810 + }, + { + "epoch": 0.73802669458257, + "grad_norm": 20.3377742767334, + "learning_rate": 9.749266994893756e-07, + "logits/chosen": -1.2338041067123413, + "logits/rejected": -1.084707498550415, + "logps/chosen": -491.3687438964844, + "logps/rejected": -575.6991577148438, + "loss": 0.5842, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5079922676086426, + "rewards/margins": 0.9769840240478516, + "rewards/rejected": -3.484976291656494, + "step": 2820 + }, + { + "epoch": 0.7406438105208061, + "grad_norm": 10.66334056854248, + "learning_rate": 9.56889026517913e-07, + "logits/chosen": -1.2951042652130127, + "logits/rejected": -1.1750590801239014, + "logps/chosen": -511.2039489746094, + "logps/rejected": -602.0439453125, + "loss": 0.4889, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.499505043029785, + "rewards/margins": 1.1682642698287964, + "rewards/rejected": -3.66776967048645, + "step": 2830 + }, + { + "epoch": 0.7432609264590422, + "grad_norm": 8.688248634338379, + "learning_rate": 9.389802028686617e-07, + "logits/chosen": -1.371790885925293, + "logits/rejected": -1.270684003829956, + "logps/chosen": -503.0506896972656, + "logps/rejected": -553.471923828125, + "loss": 0.5805, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.423825979232788, + "rewards/margins": 0.8341992497444153, + "rewards/rejected": -3.2580254077911377, + "step": 2840 + }, + { + "epoch": 0.7458780423972782, + "grad_norm": 13.514219284057617, + "learning_rate": 9.212017239232427e-07, + "logits/chosen": -1.2997102737426758, + "logits/rejected": -1.1494895219802856, + "logps/chosen": -497.5753479003906, + "logps/rejected": -604.2395629882812, + "loss": 0.4702, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.2467424869537354, + "rewards/margins": 1.2961435317993164, + "rewards/rejected": -3.542886257171631, + "step": 2850 + }, + { + "epoch": 0.7484951583355143, + "grad_norm": 10.475621223449707, + "learning_rate": 9.03555074179533e-07, + "logits/chosen": -1.2712374925613403, + "logits/rejected": -1.236365795135498, + "logps/chosen": -477.6539611816406, + "logps/rejected": -613.723876953125, + "loss": 0.4366, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.210069179534912, + "rewards/margins": 1.3023548126220703, + "rewards/rejected": -3.5124244689941406, + "step": 2860 + }, + { + "epoch": 0.7511122742737504, + "grad_norm": 11.980389595031738, + "learning_rate": 8.860417271277067e-07, + "logits/chosen": -1.3676173686981201, + "logits/rejected": -1.303006887435913, + "logps/chosen": -500.90008544921875, + "logps/rejected": -591.3121948242188, + "loss": 0.5046, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.3170719146728516, + "rewards/margins": 0.9840759038925171, + "rewards/rejected": -3.3011481761932373, + "step": 2870 + }, + { + "epoch": 0.7537293902119864, + "grad_norm": 8.44219970703125, + "learning_rate": 8.686631451272029e-07, + "logits/chosen": -1.323055624961853, + "logits/rejected": -1.1673837900161743, + "logps/chosen": -512.1810302734375, + "logps/rejected": -617.2755126953125, + "loss": 0.4958, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.588519811630249, + "rewards/margins": 1.3053323030471802, + "rewards/rejected": -3.8938522338867188, + "step": 2880 + }, + { + "epoch": 0.7563465061502225, + "grad_norm": 6.950828552246094, + "learning_rate": 8.514207792846168e-07, + "logits/chosen": -1.3387699127197266, + "logits/rejected": -1.2082509994506836, + "logps/chosen": -511.2001953125, + "logps/rejected": -603.291748046875, + "loss": 0.4873, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.6037611961364746, + "rewards/margins": 1.2057433128356934, + "rewards/rejected": -3.809504747390747, + "step": 2890 + }, + { + "epoch": 0.7589636220884585, + "grad_norm": 7.15659236907959, + "learning_rate": 8.343160693325356e-07, + "logits/chosen": -1.2071359157562256, + "logits/rejected": -1.0866135358810425, + "logps/chosen": -522.898193359375, + "logps/rejected": -642.6585083007812, + "loss": 0.4916, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6741576194763184, + "rewards/margins": 1.2413873672485352, + "rewards/rejected": -3.9155445098876953, + "step": 2900 + }, + { + "epoch": 0.7589636220884585, + "eval_logits/chosen": -1.1417629718780518, + "eval_logits/rejected": -1.0031887292861938, + "eval_logps/chosen": -531.7142333984375, + "eval_logps/rejected": -636.2195434570312, + "eval_loss": 0.4799574017524719, + "eval_rewards/accuracies": 0.7455000281333923, + "eval_rewards/chosen": -2.6711227893829346, + "eval_rewards/margins": 1.2453694343566895, + "eval_rewards/rejected": -3.916492223739624, + "eval_runtime": 1597.0819, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 2900 + }, + { + "epoch": 0.7615807380266946, + "grad_norm": 8.567606925964355, + "learning_rate": 8.173504435093174e-07, + "logits/chosen": -1.228169560432434, + "logits/rejected": -1.035468339920044, + "logps/chosen": -499.9391174316406, + "logps/rejected": -596.357666015625, + "loss": 0.483, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.584282398223877, + "rewards/margins": 1.2857377529144287, + "rewards/rejected": -3.8700199127197266, + "step": 2910 + }, + { + "epoch": 0.7641978539649307, + "grad_norm": 8.268507957458496, + "learning_rate": 8.00525318439836e-07, + "logits/chosen": -1.2574290037155151, + "logits/rejected": -1.1231721639633179, + "logps/chosen": -538.8193969726562, + "logps/rejected": -632.8656005859375, + "loss": 0.5454, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.62135648727417, + "rewards/margins": 1.0157769918441772, + "rewards/rejected": -3.637133836746216, + "step": 2920 + }, + { + "epoch": 0.7668149699031667, + "grad_norm": 6.73305606842041, + "learning_rate": 7.838420990171927e-07, + "logits/chosen": -1.3410282135009766, + "logits/rejected": -1.1771111488342285, + "logps/chosen": -512.6897583007812, + "logps/rejected": -595.9611206054688, + "loss": 0.5007, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4339559078216553, + "rewards/margins": 1.0879731178283691, + "rewards/rejected": -3.5219292640686035, + "step": 2930 + }, + { + "epoch": 0.7694320858414028, + "grad_norm": 8.820892333984375, + "learning_rate": 7.673021782854084e-07, + "logits/chosen": -1.195052981376648, + "logits/rejected": -1.0439643859863281, + "logps/chosen": -517.55810546875, + "logps/rejected": -611.39111328125, + "loss": 0.4715, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5697178840637207, + "rewards/margins": 1.3570950031280518, + "rewards/rejected": -3.9268131256103516, + "step": 2940 + }, + { + "epoch": 0.7720492017796389, + "grad_norm": 10.797639846801758, + "learning_rate": 7.509069373231039e-07, + "logits/chosen": -1.2065623998641968, + "logits/rejected": -1.0761216878890991, + "logps/chosen": -521.6849365234375, + "logps/rejected": -586.7667236328125, + "loss": 0.5756, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.6908392906188965, + "rewards/margins": 0.898908257484436, + "rewards/rejected": -3.589747667312622, + "step": 2950 + }, + { + "epoch": 0.7746663177178749, + "grad_norm": 9.844982147216797, + "learning_rate": 7.346577451281822e-07, + "logits/chosen": -1.219063639640808, + "logits/rejected": -1.133622407913208, + "logps/chosen": -516.6390991210938, + "logps/rejected": -624.2025146484375, + "loss": 0.4668, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.544440269470215, + "rewards/margins": 1.317181944847107, + "rewards/rejected": -3.8616223335266113, + "step": 2960 + }, + { + "epoch": 0.777283433656111, + "grad_norm": 12.88412094116211, + "learning_rate": 7.185559585035138e-07, + "logits/chosen": -1.269226312637329, + "logits/rejected": -1.0988438129425049, + "logps/chosen": -547.8538208007812, + "logps/rejected": -646.4948120117188, + "loss": 0.4824, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6517109870910645, + "rewards/margins": 1.145399570465088, + "rewards/rejected": -3.7971103191375732, + "step": 2970 + }, + { + "epoch": 0.7799005495943471, + "grad_norm": 8.924067497253418, + "learning_rate": 7.026029219436504e-07, + "logits/chosen": -1.2982590198516846, + "logits/rejected": -1.1203540563583374, + "logps/chosen": -504.6836853027344, + "logps/rejected": -623.8721313476562, + "loss": 0.4545, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5236408710479736, + "rewards/margins": 1.3211889266967773, + "rewards/rejected": -3.844829559326172, + "step": 2980 + }, + { + "epoch": 0.7825176655325831, + "grad_norm": 6.5459513664245605, + "learning_rate": 6.867999675225523e-07, + "logits/chosen": -1.3317945003509521, + "logits/rejected": -1.1817572116851807, + "logps/chosen": -483.4960021972656, + "logps/rejected": -588.3883056640625, + "loss": 0.4648, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.526824712753296, + "rewards/margins": 1.1909806728363037, + "rewards/rejected": -3.7178053855895996, + "step": 2990 + }, + { + "epoch": 0.7851347814708192, + "grad_norm": 8.477150917053223, + "learning_rate": 6.711484147823663e-07, + "logits/chosen": -1.2142189741134644, + "logits/rejected": -1.1248340606689453, + "logps/chosen": -488.151123046875, + "logps/rejected": -618.1202392578125, + "loss": 0.4708, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.5399701595306396, + "rewards/margins": 1.251422643661499, + "rewards/rejected": -3.7913928031921387, + "step": 3000 + }, + { + "epoch": 0.7851347814708192, + "eval_logits/chosen": -1.1355363130569458, + "eval_logits/rejected": -0.9962058663368225, + "eval_logps/chosen": -526.2620849609375, + "eval_logps/rejected": -623.4008178710938, + "eval_loss": 0.4796713590621948, + "eval_rewards/accuracies": 0.7475000023841858, + "eval_rewards/chosen": -2.61660099029541, + "eval_rewards/margins": 1.1717036962509155, + "eval_rewards/rejected": -3.788304328918457, + "eval_runtime": 1596.8532, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 3000 + }, + { + "epoch": 0.7877518974090553, + "grad_norm": 9.99060344696045, + "learning_rate": 6.556495706232413e-07, + "logits/chosen": -1.2248764038085938, + "logits/rejected": -1.1362669467926025, + "logps/chosen": -536.1866455078125, + "logps/rejected": -627.3114013671875, + "loss": 0.532, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6758596897125244, + "rewards/margins": 1.0958218574523926, + "rewards/rejected": -3.771681308746338, + "step": 3010 + }, + { + "epoch": 0.7903690133472913, + "grad_norm": 10.53675365447998, + "learning_rate": 6.403047291942057e-07, + "logits/chosen": -1.1557037830352783, + "logits/rejected": -0.9910370111465454, + "logps/chosen": -491.5957946777344, + "logps/rejected": -585.2208251953125, + "loss": 0.4845, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6533892154693604, + "rewards/margins": 1.1808980703353882, + "rewards/rejected": -3.834287166595459, + "step": 3020 + }, + { + "epoch": 0.7929861292855274, + "grad_norm": 12.331343650817871, + "learning_rate": 6.251151717851023e-07, + "logits/chosen": -1.2338765859603882, + "logits/rejected": -1.136103630065918, + "logps/chosen": -477.06243896484375, + "logps/rejected": -573.8302001953125, + "loss": 0.4934, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4373462200164795, + "rewards/margins": 1.100510835647583, + "rewards/rejected": -3.5378570556640625, + "step": 3030 + }, + { + "epoch": 0.7956032452237635, + "grad_norm": 6.849348545074463, + "learning_rate": 6.100821667196041e-07, + "logits/chosen": -1.4195866584777832, + "logits/rejected": -1.1471283435821533, + "logps/chosen": -517.2559814453125, + "logps/rejected": -562.2097778320312, + "loss": 0.4849, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.4172825813293457, + "rewards/margins": 1.1046142578125, + "rewards/rejected": -3.5218968391418457, + "step": 3040 + }, + { + "epoch": 0.7982203611619995, + "grad_norm": 7.553493022918701, + "learning_rate": 5.952069692493062e-07, + "logits/chosen": -1.2353808879852295, + "logits/rejected": -1.117495059967041, + "logps/chosen": -466.36297607421875, + "logps/rejected": -601.0093994140625, + "loss": 0.4162, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.381345272064209, + "rewards/margins": 1.3122522830963135, + "rewards/rejected": -3.6935970783233643, + "step": 3050 + }, + { + "epoch": 0.8008374771002356, + "grad_norm": 9.807490348815918, + "learning_rate": 5.80490821448918e-07, + "logits/chosen": -1.1880605220794678, + "logits/rejected": -1.173678994178772, + "logps/chosen": -516.3980102539062, + "logps/rejected": -701.3677978515625, + "loss": 0.4261, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.5393073558807373, + "rewards/margins": 1.4251797199249268, + "rewards/rejected": -3.9644875526428223, + "step": 3060 + }, + { + "epoch": 0.8034545930384716, + "grad_norm": 8.029556274414062, + "learning_rate": 5.659349521125459e-07, + "logits/chosen": -1.3551298379898071, + "logits/rejected": -1.2928274869918823, + "logps/chosen": -548.3309326171875, + "logps/rejected": -636.1687622070312, + "loss": 0.5164, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.670652389526367, + "rewards/margins": 1.083081603050232, + "rewards/rejected": -3.7537341117858887, + "step": 3070 + }, + { + "epoch": 0.8060717089767077, + "grad_norm": 10.774998664855957, + "learning_rate": 5.5154057665109e-07, + "logits/chosen": -1.3124372959136963, + "logits/rejected": -1.1519577503204346, + "logps/chosen": -552.2019653320312, + "logps/rejected": -666.1383666992188, + "loss": 0.4952, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.9138247966766357, + "rewards/margins": 1.403515338897705, + "rewards/rejected": -4.31734037399292, + "step": 3080 + }, + { + "epoch": 0.8086888249149438, + "grad_norm": 6.534247398376465, + "learning_rate": 5.373088969907586e-07, + "logits/chosen": -1.3402431011199951, + "logits/rejected": -1.147871732711792, + "logps/chosen": -560.8900756835938, + "logps/rejected": -637.8880004882812, + "loss": 0.4401, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8420605659484863, + "rewards/margins": 1.2714219093322754, + "rewards/rejected": -4.113482475280762, + "step": 3090 + }, + { + "epoch": 0.8113059408531798, + "grad_norm": 7.741410732269287, + "learning_rate": 5.23241101472709e-07, + "logits/chosen": -1.2486029863357544, + "logits/rejected": -1.1143968105316162, + "logps/chosen": -550.2741088867188, + "logps/rejected": -650.1898803710938, + "loss": 0.4804, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.740609645843506, + "rewards/margins": 1.2194688320159912, + "rewards/rejected": -3.960078001022339, + "step": 3100 + }, + { + "epoch": 0.8113059408531798, + "eval_logits/chosen": -1.1341181993484497, + "eval_logits/rejected": -0.9953464865684509, + "eval_logps/chosen": -546.843505859375, + "eval_logps/rejected": -656.7727661132812, + "eval_loss": 0.4807169735431671, + "eval_rewards/accuracies": 0.7475000023841858, + "eval_rewards/chosen": -2.822416067123413, + "eval_rewards/margins": 1.2996082305908203, + "eval_rewards/rejected": -4.122024059295654, + "eval_runtime": 1596.5373, + "eval_samples_per_second": 1.253, + "eval_steps_per_second": 0.157, + "step": 3100 + }, + { + "epoch": 0.8139230567914159, + "grad_norm": 12.117105484008789, + "learning_rate": 5.09338364753818e-07, + "logits/chosen": -1.3339240550994873, + "logits/rejected": -1.1453027725219727, + "logps/chosen": -560.3875122070312, + "logps/rejected": -672.76513671875, + "loss": 0.5239, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.750267267227173, + "rewards/margins": 1.282500982284546, + "rewards/rejected": -4.032768249511719, + "step": 3110 + }, + { + "epoch": 0.816540172729652, + "grad_norm": 9.4435396194458, + "learning_rate": 4.956018477086005e-07, + "logits/chosen": -1.2970499992370605, + "logits/rejected": -1.1291049718856812, + "logps/chosen": -554.2510986328125, + "logps/rejected": -649.1993408203125, + "loss": 0.5269, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.851862668991089, + "rewards/margins": 1.2229183912277222, + "rewards/rejected": -4.07478141784668, + "step": 3120 + }, + { + "epoch": 0.819157288667888, + "grad_norm": 14.918078422546387, + "learning_rate": 4.820326973322764e-07, + "logits/chosen": -1.207421064376831, + "logits/rejected": -1.115781307220459, + "logps/chosen": -536.1851806640625, + "logps/rejected": -655.1953735351562, + "loss": 0.5314, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8535518646240234, + "rewards/margins": 1.2501590251922607, + "rewards/rejected": -4.103711128234863, + "step": 3130 + }, + { + "epoch": 0.821774404606124, + "grad_norm": 14.224996566772461, + "learning_rate": 4.686320466449981e-07, + "logits/chosen": -1.1797192096710205, + "logits/rejected": -0.9966877102851868, + "logps/chosen": -504.07745361328125, + "logps/rejected": -650.1423950195312, + "loss": 0.4351, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6490511894226074, + "rewards/margins": 1.4823657274246216, + "rewards/rejected": -4.131417274475098, + "step": 3140 + }, + { + "epoch": 0.8243915205443602, + "grad_norm": 7.642600059509277, + "learning_rate": 4.554010145972418e-07, + "logits/chosen": -1.366541862487793, + "logits/rejected": -1.1694148778915405, + "logps/chosen": -543.9080200195312, + "logps/rejected": -658.1217041015625, + "loss": 0.5434, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8025176525115967, + "rewards/margins": 1.2542707920074463, + "rewards/rejected": -4.056788444519043, + "step": 3150 + }, + { + "epoch": 0.8270086364825961, + "grad_norm": 10.804971694946289, + "learning_rate": 4.4234070597637455e-07, + "logits/chosen": -1.1909892559051514, + "logits/rejected": -1.0947834253311157, + "logps/chosen": -546.0935668945312, + "logps/rejected": -647.5870361328125, + "loss": 0.5018, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.70656156539917, + "rewards/margins": 1.1345676183700562, + "rewards/rejected": -3.8411293029785156, + "step": 3160 + }, + { + "epoch": 0.8296257524208323, + "grad_norm": 6.038959980010986, + "learning_rate": 4.2945221131440783e-07, + "logits/chosen": -1.1933271884918213, + "logits/rejected": -0.9797855615615845, + "logps/chosen": -524.9500732421875, + "logps/rejected": -634.7277221679688, + "loss": 0.4177, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.5928843021392822, + "rewards/margins": 1.4146864414215088, + "rewards/rejected": -4.007569789886475, + "step": 3170 + }, + { + "epoch": 0.8322428683590684, + "grad_norm": 8.643757820129395, + "learning_rate": 4.167366067969381e-07, + "logits/chosen": -1.3083521127700806, + "logits/rejected": -1.2275665998458862, + "logps/chosen": -480.93267822265625, + "logps/rejected": -616.6661376953125, + "loss": 0.4995, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.5809030532836914, + "rewards/margins": 1.1199270486831665, + "rewards/rejected": -3.7008299827575684, + "step": 3180 + }, + { + "epoch": 0.8348599842973043, + "grad_norm": 5.735963821411133, + "learning_rate": 4.041949541732826e-07, + "logits/chosen": -1.306793451309204, + "logits/rejected": -1.2211034297943115, + "logps/chosen": -528.7879638671875, + "logps/rejected": -633.7890625, + "loss": 0.5016, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.649538040161133, + "rewards/margins": 1.1934170722961426, + "rewards/rejected": -3.8429553508758545, + "step": 3190 + }, + { + "epoch": 0.8374771002355405, + "grad_norm": 11.755797386169434, + "learning_rate": 3.9182830066782614e-07, + "logits/chosen": -1.2019577026367188, + "logits/rejected": -1.1839076280593872, + "logps/chosen": -523.3551025390625, + "logps/rejected": -663.1622314453125, + "loss": 0.4866, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.657108783721924, + "rewards/margins": 1.273402452468872, + "rewards/rejected": -3.930511474609375, + "step": 3200 + }, + { + "epoch": 0.8374771002355405, + "eval_logits/chosen": -1.1640751361846924, + "eval_logits/rejected": -1.0276466608047485, + "eval_logps/chosen": -519.5614013671875, + "eval_logps/rejected": -623.51025390625, + "eval_loss": 0.4776689112186432, + "eval_rewards/accuracies": 0.7475000023841858, + "eval_rewards/chosen": -2.5495944023132324, + "eval_rewards/margins": 1.2398039102554321, + "eval_rewards/rejected": -3.789398431777954, + "eval_runtime": 1596.1554, + "eval_samples_per_second": 1.253, + "eval_steps_per_second": 0.157, + "step": 3200 + }, + { + "epoch": 0.8400942161737766, + "grad_norm": 6.5980448722839355, + "learning_rate": 3.796376788925771e-07, + "logits/chosen": -1.2055485248565674, + "logits/rejected": -1.1355664730072021, + "logps/chosen": -506.8805236816406, + "logps/rejected": -596.1102294921875, + "loss": 0.503, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4550302028656006, + "rewards/margins": 1.1328961849212646, + "rewards/rejected": -3.5879263877868652, + "step": 3210 + }, + { + "epoch": 0.8427113321120125, + "grad_norm": 7.025763034820557, + "learning_rate": 3.676241067609465e-07, + "logits/chosen": -1.2890058755874634, + "logits/rejected": -1.1607505083084106, + "logps/chosen": -542.5242919921875, + "logps/rejected": -612.6026611328125, + "loss": 0.5185, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.488643169403076, + "rewards/margins": 1.1322394609451294, + "rewards/rejected": -3.620882749557495, + "step": 3220 + }, + { + "epoch": 0.8453284480502486, + "grad_norm": 11.517923355102539, + "learning_rate": 3.5578858740274976e-07, + "logits/chosen": -1.2237865924835205, + "logits/rejected": -1.122482419013977, + "logps/chosen": -517.73974609375, + "logps/rejected": -605.0042114257812, + "loss": 0.513, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5605525970458984, + "rewards/margins": 0.999243438243866, + "rewards/rejected": -3.5597965717315674, + "step": 3230 + }, + { + "epoch": 0.8479455639884846, + "grad_norm": 10.88183307647705, + "learning_rate": 3.44132109080447e-07, + "logits/chosen": -1.4292676448822021, + "logits/rejected": -1.2410900592803955, + "logps/chosen": -495.89813232421875, + "logps/rejected": -589.0921630859375, + "loss": 0.4351, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.3068690299987793, + "rewards/margins": 1.3076789379119873, + "rewards/rejected": -3.6145482063293457, + "step": 3240 + }, + { + "epoch": 0.8505626799267207, + "grad_norm": 11.31247615814209, + "learning_rate": 3.3265564510662344e-07, + "logits/chosen": -1.3701411485671997, + "logits/rejected": -1.213030457496643, + "logps/chosen": -523.7352294921875, + "logps/rejected": -627.9140625, + "loss": 0.433, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.305351495742798, + "rewards/margins": 1.2927907705307007, + "rewards/rejected": -3.598142147064209, + "step": 3250 + }, + { + "epoch": 0.8531797958649568, + "grad_norm": 15.092782974243164, + "learning_rate": 3.213601537627195e-07, + "logits/chosen": -1.2755637168884277, + "logits/rejected": -1.1552997827529907, + "logps/chosen": -517.78369140625, + "logps/rejected": -615.5398559570312, + "loss": 0.5176, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.607300281524658, + "rewards/margins": 1.1482493877410889, + "rewards/rejected": -3.755549669265747, + "step": 3260 + }, + { + "epoch": 0.8557969118031928, + "grad_norm": 18.888704299926758, + "learning_rate": 3.1024657821901063e-07, + "logits/chosen": -1.3271687030792236, + "logits/rejected": -1.2444849014282227, + "logps/chosen": -477.8196716308594, + "logps/rejected": -587.2836303710938, + "loss": 0.4719, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.301666498184204, + "rewards/margins": 1.2528338432312012, + "rewards/rejected": -3.554500102996826, + "step": 3270 + }, + { + "epoch": 0.8584140277414289, + "grad_norm": 10.203365325927734, + "learning_rate": 2.9931584645585654e-07, + "logits/chosen": -1.245596170425415, + "logits/rejected": -1.222507357597351, + "logps/chosen": -513.1766357421875, + "logps/rejected": -632.6079711914062, + "loss": 0.4915, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.386977434158325, + "rewards/margins": 1.1650269031524658, + "rewards/rejected": -3.55200457572937, + "step": 3280 + }, + { + "epoch": 0.861031143679665, + "grad_norm": 4.975100040435791, + "learning_rate": 2.885688711862136e-07, + "logits/chosen": -1.2813748121261597, + "logits/rejected": -1.2710316181182861, + "logps/chosen": -519.4554443359375, + "logps/rejected": -658.6768188476562, + "loss": 0.498, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.585131883621216, + "rewards/margins": 1.4114031791687012, + "rewards/rejected": -3.996534824371338, + "step": 3290 + }, + { + "epoch": 0.863648259617901, + "grad_norm": 6.192923069000244, + "learning_rate": 2.7800654977942486e-07, + "logits/chosen": -1.268638014793396, + "logits/rejected": -1.1421396732330322, + "logps/chosen": -508.80584716796875, + "logps/rejected": -629.664306640625, + "loss": 0.4967, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.516002893447876, + "rewards/margins": 1.2464697360992432, + "rewards/rejected": -3.762472629547119, + "step": 3300 + }, + { + "epoch": 0.863648259617901, + "eval_logits/chosen": -1.1608073711395264, + "eval_logits/rejected": -1.0241122245788574, + "eval_logps/chosen": -520.3804321289062, + "eval_logps/rejected": -625.6535034179688, + "eval_loss": 0.47857987880706787, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -2.5577852725982666, + "eval_rewards/margins": 1.2530462741851807, + "eval_rewards/rejected": -3.8108315467834473, + "eval_runtime": 1596.9029, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 3300 + }, + { + "epoch": 0.8662653755561371, + "grad_norm": 14.262938499450684, + "learning_rate": 2.6762976418628797e-07, + "logits/chosen": -1.2763694524765015, + "logits/rejected": -1.12501859664917, + "logps/chosen": -479.6934509277344, + "logps/rejected": -549.134765625, + "loss": 0.5232, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.4996352195739746, + "rewards/margins": 1.1307785511016846, + "rewards/rejected": -3.630413770675659, + "step": 3310 + }, + { + "epoch": 0.8688824914943732, + "grad_norm": 6.411558628082275, + "learning_rate": 2.5743938086541354e-07, + "logits/chosen": -1.277998447418213, + "logits/rejected": -1.1551088094711304, + "logps/chosen": -519.344482421875, + "logps/rejected": -614.8687744140625, + "loss": 0.5019, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5514492988586426, + "rewards/margins": 1.2041881084442139, + "rewards/rejected": -3.7556369304656982, + "step": 3320 + }, + { + "epoch": 0.8714996074326092, + "grad_norm": 8.243268966674805, + "learning_rate": 2.4743625071087574e-07, + "logits/chosen": -1.444610834121704, + "logits/rejected": -1.2527801990509033, + "logps/chosen": -515.1294555664062, + "logps/rejected": -632.4005737304688, + "loss": 0.4532, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.376555919647217, + "rewards/margins": 1.4679479598999023, + "rewards/rejected": -3.844503879547119, + "step": 3330 + }, + { + "epoch": 0.8741167233708453, + "grad_norm": 10.0430908203125, + "learning_rate": 2.3762120898116498e-07, + "logits/chosen": -1.2998971939086914, + "logits/rejected": -1.1857765913009644, + "logps/chosen": -535.6951293945312, + "logps/rejected": -634.7950439453125, + "loss": 0.495, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6694915294647217, + "rewards/margins": 1.0715022087097168, + "rewards/rejected": -3.7409939765930176, + "step": 3340 + }, + { + "epoch": 0.8767338393090814, + "grad_norm": 9.759672164916992, + "learning_rate": 2.2799507522944048e-07, + "logits/chosen": -1.2516978979110718, + "logits/rejected": -1.143761396408081, + "logps/chosen": -506.92779541015625, + "logps/rejected": -630.4886474609375, + "loss": 0.4682, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4124372005462646, + "rewards/margins": 1.3180114030838013, + "rewards/rejected": -3.7304489612579346, + "step": 3350 + }, + { + "epoch": 0.8793509552473174, + "grad_norm": 10.614029884338379, + "learning_rate": 2.1855865323510056e-07, + "logits/chosen": -1.3051875829696655, + "logits/rejected": -1.0945546627044678, + "logps/chosen": -524.2318115234375, + "logps/rejected": -672.577880859375, + "loss": 0.4244, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.515920639038086, + "rewards/margins": 1.5423296689987183, + "rewards/rejected": -4.058249473571777, + "step": 3360 + }, + { + "epoch": 0.8819680711855535, + "grad_norm": 7.248552322387695, + "learning_rate": 2.0931273093666575e-07, + "logits/chosen": -1.2330162525177002, + "logits/rejected": -1.0763533115386963, + "logps/chosen": -503.97393798828125, + "logps/rejected": -626.6051025390625, + "loss": 0.4239, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.638221025466919, + "rewards/margins": 1.4292933940887451, + "rewards/rejected": -4.067514419555664, + "step": 3370 + }, + { + "epoch": 0.8845851871237895, + "grad_norm": 15.31811237335205, + "learning_rate": 2.002580803659873e-07, + "logits/chosen": -1.244128942489624, + "logits/rejected": -1.1012144088745117, + "logps/chosen": -522.0372924804688, + "logps/rejected": -629.9783325195312, + "loss": 0.4693, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.676211357116699, + "rewards/margins": 1.2609608173370361, + "rewards/rejected": -3.9371724128723145, + "step": 3380 + }, + { + "epoch": 0.8872023030620256, + "grad_norm": 5.824941635131836, + "learning_rate": 1.913954575837826e-07, + "logits/chosen": -1.3020130395889282, + "logits/rejected": -1.055781602859497, + "logps/chosen": -542.0621337890625, + "logps/rejected": -616.4013061523438, + "loss": 0.4735, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7002296447753906, + "rewards/margins": 1.2402583360671997, + "rewards/rejected": -3.9404876232147217, + "step": 3390 + }, + { + "epoch": 0.8898194190002617, + "grad_norm": 9.155735969543457, + "learning_rate": 1.827256026165028e-07, + "logits/chosen": -1.3028671741485596, + "logits/rejected": -1.1115076541900635, + "logps/chosen": -565.5799560546875, + "logps/rejected": -653.9213256835938, + "loss": 0.4272, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.5059080123901367, + "rewards/margins": 1.455091118812561, + "rewards/rejected": -3.960999011993408, + "step": 3400 + }, + { + "epoch": 0.8898194190002617, + "eval_logits/chosen": -1.1445426940917969, + "eval_logits/rejected": -1.0071464776992798, + "eval_logps/chosen": -536.8281860351562, + "eval_logps/rejected": -647.4435424804688, + "eval_loss": 0.47965455055236816, + "eval_rewards/accuracies": 0.7459999918937683, + "eval_rewards/chosen": -2.722262382507324, + "eval_rewards/margins": 1.306469440460205, + "eval_rewards/rejected": -4.0287322998046875, + "eval_runtime": 1597.1181, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.157, + "step": 3400 + }, + { + "epoch": 0.8924365349384977, + "grad_norm": 9.588942527770996, + "learning_rate": 1.7424923939454274e-07, + "logits/chosen": -1.2525078058242798, + "logits/rejected": -1.0694096088409424, + "logps/chosen": -553.6488037109375, + "logps/rejected": -648.3309326171875, + "loss": 0.4282, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.723491668701172, + "rewards/margins": 1.380472183227539, + "rewards/rejected": -4.103963851928711, + "step": 3410 + }, + { + "epoch": 0.8950536508767338, + "grad_norm": 19.193740844726562, + "learning_rate": 1.6596707569179304e-07, + "logits/chosen": -1.3545329570770264, + "logits/rejected": -1.1915156841278076, + "logps/chosen": -562.5875244140625, + "logps/rejected": -653.5098876953125, + "loss": 0.4928, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.815075635910034, + "rewards/margins": 1.2605375051498413, + "rewards/rejected": -4.075612545013428, + "step": 3420 + }, + { + "epoch": 0.8976707668149699, + "grad_norm": 14.299762725830078, + "learning_rate": 1.578798030665385e-07, + "logits/chosen": -1.283125638961792, + "logits/rejected": -1.0879403352737427, + "logps/chosen": -551.2545166015625, + "logps/rejected": -682.09228515625, + "loss": 0.4447, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.7663490772247314, + "rewards/margins": 1.4793423414230347, + "rewards/rejected": -4.245691776275635, + "step": 3430 + }, + { + "epoch": 0.9002878827532059, + "grad_norm": 8.79478645324707, + "learning_rate": 1.499880968037165e-07, + "logits/chosen": -1.2685706615447998, + "logits/rejected": -1.1165311336517334, + "logps/chosen": -530.1332397460938, + "logps/rejected": -616.2289428710938, + "loss": 0.5132, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7463672161102295, + "rewards/margins": 1.2237727642059326, + "rewards/rejected": -3.970139980316162, + "step": 3440 + }, + { + "epoch": 0.902904998691442, + "grad_norm": 14.623788833618164, + "learning_rate": 1.4229261585852805e-07, + "logits/chosen": -1.298165202140808, + "logits/rejected": -1.205263614654541, + "logps/chosen": -537.5665283203125, + "logps/rejected": -645.2786865234375, + "loss": 0.4557, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.692805528640747, + "rewards/margins": 1.3347504138946533, + "rewards/rejected": -4.0275559425354, + "step": 3450 + }, + { + "epoch": 0.9055221146296781, + "grad_norm": 11.80216121673584, + "learning_rate": 1.3479400280141886e-07, + "logits/chosen": -1.2136515378952026, + "logits/rejected": -1.1657497882843018, + "logps/chosen": -525.5947265625, + "logps/rejected": -662.1110229492188, + "loss": 0.4657, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7938077449798584, + "rewards/margins": 1.385801076889038, + "rewards/rejected": -4.1796088218688965, + "step": 3460 + }, + { + "epoch": 0.9081392305679141, + "grad_norm": 9.422633171081543, + "learning_rate": 1.2749288376442044e-07, + "logits/chosen": -1.3240694999694824, + "logits/rejected": -1.106979489326477, + "logps/chosen": -565.2491455078125, + "logps/rejected": -643.6472778320312, + "loss": 0.4631, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.6884024143218994, + "rewards/margins": 1.3586628437042236, + "rewards/rejected": -4.047064781188965, + "step": 3470 + }, + { + "epoch": 0.9107563465061502, + "grad_norm": 9.699939727783203, + "learning_rate": 1.203898683888713e-07, + "logits/chosen": -1.306654691696167, + "logits/rejected": -1.1677879095077515, + "logps/chosen": -532.5252685546875, + "logps/rejected": -648.9597778320312, + "loss": 0.5443, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.886373519897461, + "rewards/margins": 1.1954319477081299, + "rewards/rejected": -4.081805229187012, + "step": 3480 + }, + { + "epoch": 0.9133734624443863, + "grad_norm": 11.356287002563477, + "learning_rate": 1.1348554977451132e-07, + "logits/chosen": -1.3395811319351196, + "logits/rejected": -1.1923797130584717, + "logps/chosen": -551.8304443359375, + "logps/rejected": -638.4985961914062, + "loss": 0.5142, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.699251651763916, + "rewards/margins": 1.195854902267456, + "rewards/rejected": -3.895106792449951, + "step": 3490 + }, + { + "epoch": 0.9159905783826223, + "grad_norm": 10.900007247924805, + "learning_rate": 1.0678050442995802e-07, + "logits/chosen": -1.306223750114441, + "logits/rejected": -1.088254690170288, + "logps/chosen": -554.8663330078125, + "logps/rejected": -634.9073486328125, + "loss": 0.5272, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7420237064361572, + "rewards/margins": 1.2511793375015259, + "rewards/rejected": -3.9932029247283936, + "step": 3500 + }, + { + "epoch": 0.9159905783826223, + "eval_logits/chosen": -1.160068154335022, + "eval_logits/rejected": -1.023296594619751, + "eval_logps/chosen": -536.0448608398438, + "eval_logps/rejected": -647.7730102539062, + "eval_loss": 0.4797233045101166, + "eval_rewards/accuracies": 0.746999979019165, + "eval_rewards/chosen": -2.7144289016723633, + "eval_rewards/margins": 1.3175978660583496, + "eval_rewards/rejected": -4.032026767730713, + "eval_runtime": 1597.7222, + "eval_samples_per_second": 1.252, + "eval_steps_per_second": 0.156, + "step": 3500 + }, + { + "epoch": 0.9186076943208584, + "grad_norm": 9.065542221069336, + "learning_rate": 1.0027529222456755e-07, + "logits/chosen": -1.2851347923278809, + "logits/rejected": -1.0965713262557983, + "logps/chosen": -514.8778076171875, + "logps/rejected": -634.3299560546875, + "loss": 0.4078, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.6071436405181885, + "rewards/margins": 1.371249794960022, + "rewards/rejected": -3.978393077850342, + "step": 3510 + }, + { + "epoch": 0.9212248102590945, + "grad_norm": 12.434161186218262, + "learning_rate": 9.397045634168766e-08, + "logits/chosen": -1.3080555200576782, + "logits/rejected": -1.222429633140564, + "logps/chosen": -531.6901245117188, + "logps/rejected": -675.6932373046875, + "loss": 0.4615, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.632573366165161, + "rewards/margins": 1.43732750415802, + "rewards/rejected": -4.069900989532471, + "step": 3520 + }, + { + "epoch": 0.9238419261973305, + "grad_norm": 16.84272003173828, + "learning_rate": 8.78665232332998e-08, + "logits/chosen": -1.243849515914917, + "logits/rejected": -1.1529252529144287, + "logps/chosen": -504.6124572753906, + "logps/rejected": -625.7620849609375, + "loss": 0.474, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7395050525665283, + "rewards/margins": 1.2195281982421875, + "rewards/rejected": -3.959033489227295, + "step": 3530 + }, + { + "epoch": 0.9264590421355666, + "grad_norm": 8.225701332092285, + "learning_rate": 8.196400257606208e-08, + "logits/chosen": -1.348672866821289, + "logits/rejected": -1.1730903387069702, + "logps/chosen": -544.8364868164062, + "logps/rejected": -697.6771240234375, + "loss": 0.4144, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6183691024780273, + "rewards/margins": 1.5808098316192627, + "rewards/rejected": -4.199179172515869, + "step": 3540 + }, + { + "epoch": 0.9290761580738026, + "grad_norm": 11.679130554199219, + "learning_rate": 7.626338722875076e-08, + "logits/chosen": -1.2883799076080322, + "logits/rejected": -1.2092903852462769, + "logps/chosen": -517.6767578125, + "logps/rejected": -647.9732666015625, + "loss": 0.4827, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6364307403564453, + "rewards/margins": 1.2941230535507202, + "rewards/rejected": -3.930554151535034, + "step": 3550 + }, + { + "epoch": 0.9316932740120387, + "grad_norm": 5.509209632873535, + "learning_rate": 7.076515319110688e-08, + "logits/chosen": -1.283998727798462, + "logits/rejected": -1.1587374210357666, + "logps/chosen": -517.294677734375, + "logps/rejected": -615.6148071289062, + "loss": 0.5114, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6106066703796387, + "rewards/margins": 1.3990845680236816, + "rewards/rejected": -4.00969123840332, + "step": 3560 + }, + { + "epoch": 0.9343103899502748, + "grad_norm": 7.372361660003662, + "learning_rate": 6.54697595640899e-08, + "logits/chosen": -1.3087977170944214, + "logits/rejected": -1.178120493888855, + "logps/chosen": -560.3231201171875, + "logps/rejected": -660.3911743164062, + "loss": 0.4774, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6826512813568115, + "rewards/margins": 1.2511831521987915, + "rewards/rejected": -3.9338345527648926, + "step": 3570 + }, + { + "epoch": 0.9369275058885108, + "grad_norm": 9.634334564208984, + "learning_rate": 6.037764851154426e-08, + "logits/chosen": -1.3032505512237549, + "logits/rejected": -1.231890320777893, + "logps/chosen": -522.193603515625, + "logps/rejected": -652.8558349609375, + "loss": 0.4929, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5825421810150146, + "rewards/margins": 1.2584049701690674, + "rewards/rejected": -3.840946912765503, + "step": 3580 + }, + { + "epoch": 0.9395446218267469, + "grad_norm": 7.305212497711182, + "learning_rate": 5.548924522327748e-08, + "logits/chosen": -1.2807940244674683, + "logits/rejected": -1.1494871377944946, + "logps/chosen": -517.2708740234375, + "logps/rejected": -632.7682495117188, + "loss": 0.4594, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.5563392639160156, + "rewards/margins": 1.3009912967681885, + "rewards/rejected": -3.857330799102783, + "step": 3590 + }, + { + "epoch": 0.942161737764983, + "grad_norm": 13.96353530883789, + "learning_rate": 5.0804957879556915e-08, + "logits/chosen": -1.198561429977417, + "logits/rejected": -1.0997329950332642, + "logps/chosen": -484.185546875, + "logps/rejected": -620.9610595703125, + "loss": 0.4441, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.587231397628784, + "rewards/margins": 1.3218923807144165, + "rewards/rejected": -3.909123659133911, + "step": 3600 + }, + { + "epoch": 0.942161737764983, + "eval_logits/chosen": -1.164100170135498, + "eval_logits/rejected": -1.0277760028839111, + "eval_logps/chosen": -529.1943969726562, + "eval_logps/rejected": -639.7042846679688, + "eval_loss": 0.4790266156196594, + "eval_rewards/accuracies": 0.746999979019165, + "eval_rewards/chosen": -2.6459240913391113, + "eval_rewards/margins": 1.3054152727127075, + "eval_rewards/rejected": -3.9513394832611084, + "eval_runtime": 1598.6642, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.156, + "step": 3600 + }, + { + "epoch": 0.944778853703219, + "grad_norm": 9.906332969665527, + "learning_rate": 4.632517761702815e-08, + "logits/chosen": -1.2350178956985474, + "logits/rejected": -1.0819157361984253, + "logps/chosen": -498.882568359375, + "logps/rejected": -640.2724609375, + "loss": 0.4371, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.635540008544922, + "rewards/margins": 1.536821722984314, + "rewards/rejected": -4.172361850738525, + "step": 3610 + }, + { + "epoch": 0.9473959696414551, + "grad_norm": 12.568668365478516, + "learning_rate": 4.205027849605359e-08, + "logits/chosen": -1.2538177967071533, + "logits/rejected": -1.1405234336853027, + "logps/chosen": -525.0173950195312, + "logps/rejected": -613.74560546875, + "loss": 0.5461, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.771757125854492, + "rewards/margins": 1.1880303621292114, + "rewards/rejected": -3.959787368774414, + "step": 3620 + }, + { + "epoch": 0.9500130855796912, + "grad_norm": 8.876080513000488, + "learning_rate": 3.798061746947995e-08, + "logits/chosen": -1.373834252357483, + "logits/rejected": -1.228161334991455, + "logps/chosen": -527.4785766601562, + "logps/rejected": -619.109375, + "loss": 0.4818, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6347904205322266, + "rewards/margins": 1.3071154356002808, + "rewards/rejected": -3.941905975341797, + "step": 3630 + }, + { + "epoch": 0.9526302015179272, + "grad_norm": 9.864373207092285, + "learning_rate": 3.411653435283158e-08, + "logits/chosen": -1.29331636428833, + "logits/rejected": -1.077043056488037, + "logps/chosen": -534.7817993164062, + "logps/rejected": -597.4425048828125, + "loss": 0.4952, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6166446208953857, + "rewards/margins": 1.1832177639007568, + "rewards/rejected": -3.7998623847961426, + "step": 3640 + }, + { + "epoch": 0.9552473174561633, + "grad_norm": 12.852518081665039, + "learning_rate": 3.04583517959367e-08, + "logits/chosen": -1.339290976524353, + "logits/rejected": -1.1780240535736084, + "logps/chosen": -494.443603515625, + "logps/rejected": -590.2618408203125, + "loss": 0.4506, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4245123863220215, + "rewards/margins": 1.2823518514633179, + "rewards/rejected": -3.70686411857605, + "step": 3650 + }, + { + "epoch": 0.9578644333943994, + "grad_norm": 9.657252311706543, + "learning_rate": 2.7006375255985984e-08, + "logits/chosen": -1.281280755996704, + "logits/rejected": -1.2413251399993896, + "logps/chosen": -539.2777099609375, + "logps/rejected": -645.9277954101562, + "loss": 0.5566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.7454452514648438, + "rewards/margins": 1.0612398386001587, + "rewards/rejected": -3.8066844940185547, + "step": 3660 + }, + { + "epoch": 0.9604815493326354, + "grad_norm": 12.267367362976074, + "learning_rate": 2.3760892972027328e-08, + "logits/chosen": -1.3983352184295654, + "logits/rejected": -1.225462794303894, + "logps/chosen": -544.6994018554688, + "logps/rejected": -638.8689575195312, + "loss": 0.5249, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7232279777526855, + "rewards/margins": 1.2808904647827148, + "rewards/rejected": -4.0041184425354, + "step": 3670 + }, + { + "epoch": 0.9630986652708715, + "grad_norm": 13.94206714630127, + "learning_rate": 2.072217594089765e-08, + "logits/chosen": -1.2447559833526611, + "logits/rejected": -1.2226978540420532, + "logps/chosen": -527.1612548828125, + "logps/rejected": -661.6229248046875, + "loss": 0.3918, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.6566879749298096, + "rewards/margins": 1.467563271522522, + "rewards/rejected": -4.124251365661621, + "step": 3680 + }, + { + "epoch": 0.9657157812091076, + "grad_norm": 8.289405822753906, + "learning_rate": 1.789047789459375e-08, + "logits/chosen": -1.3621820211410522, + "logits/rejected": -1.1523287296295166, + "logps/chosen": -576.5960693359375, + "logps/rejected": -663.8076782226562, + "loss": 0.5052, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.597318172454834, + "rewards/margins": 1.3620169162750244, + "rewards/rejected": -3.9593348503112793, + "step": 3690 + }, + { + "epoch": 0.9683328971473436, + "grad_norm": 7.745994567871094, + "learning_rate": 1.5266035279088708e-08, + "logits/chosen": -1.1971657276153564, + "logits/rejected": -1.0677430629730225, + "logps/chosen": -573.20361328125, + "logps/rejected": -680.00244140625, + "loss": 0.4823, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.7669663429260254, + "rewards/margins": 1.330165147781372, + "rewards/rejected": -4.097131729125977, + "step": 3700 + }, + { + "epoch": 0.9683328971473436, + "eval_logits/chosen": -1.1687482595443726, + "eval_logits/rejected": -1.0329276323318481, + "eval_logps/chosen": -527.3952026367188, + "eval_logps/rejected": -637.1880493164062, + "eval_loss": 0.47885680198669434, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -2.627932548522949, + "eval_rewards/margins": 1.298244595527649, + "eval_rewards/rejected": -3.9261767864227295, + "eval_runtime": 1598.696, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.156, + "step": 3700 + }, + { + "epoch": 0.9709500130855797, + "grad_norm": 18.376056671142578, + "learning_rate": 1.2849067234584623e-08, + "logits/chosen": -1.182948112487793, + "logits/rejected": -1.10740065574646, + "logps/chosen": -496.62152099609375, + "logps/rejected": -624.2833862304688, + "loss": 0.4808, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.603543281555176, + "rewards/margins": 1.3529905080795288, + "rewards/rejected": -3.956533908843994, + "step": 3710 + }, + { + "epoch": 0.9735671290238157, + "grad_norm": 9.365938186645508, + "learning_rate": 1.0639775577218625e-08, + "logits/chosen": -1.1770192384719849, + "logits/rejected": -1.0035854578018188, + "logps/chosen": -516.5242309570312, + "logps/rejected": -607.8884887695312, + "loss": 0.5282, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6737780570983887, + "rewards/margins": 1.2736929655075073, + "rewards/rejected": -3.9474711418151855, + "step": 3720 + }, + { + "epoch": 0.9761842449620518, + "grad_norm": 11.24964427947998, + "learning_rate": 8.638344782207486e-09, + "logits/chosen": -1.1972987651824951, + "logits/rejected": -1.0941554307937622, + "logps/chosen": -499.6304626464844, + "logps/rejected": -600.4740600585938, + "loss": 0.4853, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.5446014404296875, + "rewards/margins": 1.2359497547149658, + "rewards/rejected": -3.7805511951446533, + "step": 3730 + }, + { + "epoch": 0.9788013609002879, + "grad_norm": 10.543977737426758, + "learning_rate": 6.84494196844715e-09, + "logits/chosen": -1.2653145790100098, + "logits/rejected": -1.1376771926879883, + "logps/chosen": -532.8594970703125, + "logps/rejected": -669.3706665039062, + "loss": 0.4524, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.608578681945801, + "rewards/margins": 1.4806853532791138, + "rewards/rejected": -4.089264392852783, + "step": 3740 + }, + { + "epoch": 0.9814184768385239, + "grad_norm": 8.994680404663086, + "learning_rate": 5.259716884556121e-09, + "logits/chosen": -1.3201556205749512, + "logits/rejected": -1.1730735301971436, + "logps/chosen": -524.9544677734375, + "logps/rejected": -640.7349853515625, + "loss": 0.451, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6050658226013184, + "rewards/margins": 1.3061447143554688, + "rewards/rejected": -3.911210536956787, + "step": 3750 + }, + { + "epoch": 0.98403559277676, + "grad_norm": 9.462129592895508, + "learning_rate": 3.882801896372967e-09, + "logits/chosen": -1.3206380605697632, + "logits/rejected": -1.223382830619812, + "logps/chosen": -523.351318359375, + "logps/rejected": -627.8754272460938, + "loss": 0.4966, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.58086895942688, + "rewards/margins": 1.3636963367462158, + "rewards/rejected": -3.9445652961730957, + "step": 3760 + }, + { + "epoch": 0.9866527087149961, + "grad_norm": 11.428803443908691, + "learning_rate": 2.7143119759026614e-09, + "logits/chosen": -1.3305742740631104, + "logits/rejected": -1.1541904211044312, + "logps/chosen": -536.9759521484375, + "logps/rejected": -636.7061157226562, + "loss": 0.4246, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.528337001800537, + "rewards/margins": 1.2519800662994385, + "rewards/rejected": -3.7803173065185547, + "step": 3770 + }, + { + "epoch": 0.9892698246532321, + "grad_norm": 10.81966781616211, + "learning_rate": 1.754344691717591e-09, + "logits/chosen": -1.2256147861480713, + "logits/rejected": -1.1796106100082397, + "logps/chosen": -514.3230590820312, + "logps/rejected": -642.4959716796875, + "loss": 0.5022, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.645564556121826, + "rewards/margins": 1.0614073276519775, + "rewards/rejected": -3.7069716453552246, + "step": 3780 + }, + { + "epoch": 0.9918869405914682, + "grad_norm": 16.201265335083008, + "learning_rate": 1.0029802008096335e-09, + "logits/chosen": -1.2533369064331055, + "logits/rejected": -1.0869061946868896, + "logps/chosen": -542.9913330078125, + "logps/rejected": -650.4954833984375, + "loss": 0.4961, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.687774181365967, + "rewards/margins": 1.3033173084259033, + "rewards/rejected": -3.99109148979187, + "step": 3790 + }, + { + "epoch": 0.9945040565297043, + "grad_norm": 7.363870143890381, + "learning_rate": 4.602812418974534e-10, + "logits/chosen": -1.3605427742004395, + "logits/rejected": -1.2153841257095337, + "logps/chosen": -546.0870971679688, + "logps/rejected": -652.19921875, + "loss": 0.4996, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6433169841766357, + "rewards/margins": 1.3066623210906982, + "rewards/rejected": -3.949979305267334, + "step": 3800 + }, + { + "epoch": 0.9945040565297043, + "eval_logits/chosen": -1.1657898426055908, + "eval_logits/rejected": -1.0296279191970825, + "eval_logps/chosen": -526.756103515625, + "eval_logps/rejected": -636.4028930664062, + "eval_loss": 0.4788345396518707, + "eval_rewards/accuracies": 0.7475000023841858, + "eval_rewards/chosen": -2.62154221534729, + "eval_rewards/margins": 1.2967824935913086, + "eval_rewards/rejected": -3.9183249473571777, + "eval_runtime": 1598.3049, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.156, + "step": 3800 + }, + { + "epoch": 0.9971211724679403, + "grad_norm": 12.966708183288574, + "learning_rate": 1.2629313018819312e-10, + "logits/chosen": -1.2771844863891602, + "logits/rejected": -1.147637128829956, + "logps/chosen": -510.83001708984375, + "logps/rejected": -608.63037109375, + "loss": 0.503, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.5848309993743896, + "rewards/margins": 1.134263277053833, + "rewards/rejected": -3.7190945148468018, + "step": 3810 + }, + { + "epoch": 0.9997382884061764, + "grad_norm": 16.982664108276367, + "learning_rate": 1.0437535929996855e-12, + "logits/chosen": -1.262603998184204, + "logits/rejected": -1.0868072509765625, + "logps/chosen": -551.4014892578125, + "logps/rejected": -662.4216918945312, + "loss": 0.4568, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6468563079833984, + "rewards/margins": 1.5578194856643677, + "rewards/rejected": -4.204675674438477, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.517807064771465, + "train_runtime": 164396.369, + "train_samples_per_second": 0.372, + "train_steps_per_second": 0.023 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}