zephyr-7b-dpo-qlora / trainer_state.json
guoqiang-x's picture
Model save
eebedcd verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 3821,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00026171159382360636,
"grad_norm": 2.427435874938965,
"learning_rate": 1.3054830287206268e-08,
"logits/chosen": -2.452890634536743,
"logits/rejected": -2.3576245307922363,
"logps/chosen": -290.49053955078125,
"logps/rejected": -374.69940185546875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0026171159382360636,
"grad_norm": 2.4065892696380615,
"learning_rate": 1.3054830287206266e-07,
"logits/chosen": -2.280916452407837,
"logits/rejected": -2.18080735206604,
"logps/chosen": -279.5721435546875,
"logps/rejected": -245.38124084472656,
"loss": 0.6931,
"rewards/accuracies": 0.4236111044883728,
"rewards/chosen": 0.0002959521661978215,
"rewards/margins": 4.458064722712152e-05,
"rewards/rejected": 0.0002513715880922973,
"step": 10
},
{
"epoch": 0.005234231876472127,
"grad_norm": 2.543537139892578,
"learning_rate": 2.610966057441253e-07,
"logits/chosen": -2.286400318145752,
"logits/rejected": -2.1322734355926514,
"logps/chosen": -305.47900390625,
"logps/rejected": -237.6411895751953,
"loss": 0.6926,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0033905524760484695,
"rewards/margins": 0.0010894734878093004,
"rewards/rejected": 0.002301078988239169,
"step": 20
},
{
"epoch": 0.007851347814708191,
"grad_norm": 2.317607879638672,
"learning_rate": 3.9164490861618804e-07,
"logits/chosen": -2.2721304893493652,
"logits/rejected": -2.2249627113342285,
"logps/chosen": -251.0873565673828,
"logps/rejected": -251.26864624023438,
"loss": 0.6923,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.012264861725270748,
"rewards/margins": 0.0016630779718980193,
"rewards/rejected": 0.010601785033941269,
"step": 30
},
{
"epoch": 0.010468463752944255,
"grad_norm": 1.9544142484664917,
"learning_rate": 5.221932114882506e-07,
"logits/chosen": -2.1681597232818604,
"logits/rejected": -2.1325502395629883,
"logps/chosen": -216.1050262451172,
"logps/rejected": -221.6034698486328,
"loss": 0.6911,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.019591109827160835,
"rewards/margins": 0.00413005193695426,
"rewards/rejected": 0.015461057424545288,
"step": 40
},
{
"epoch": 0.01308557969118032,
"grad_norm": 2.0900888442993164,
"learning_rate": 6.527415143603135e-07,
"logits/chosen": -2.2135119438171387,
"logits/rejected": -2.1745445728302,
"logps/chosen": -266.76007080078125,
"logps/rejected": -234.2284698486328,
"loss": 0.6907,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.02980896458029747,
"rewards/margins": 0.005023510195314884,
"rewards/rejected": 0.02478545531630516,
"step": 50
},
{
"epoch": 0.015702695629416383,
"grad_norm": 2.1390092372894287,
"learning_rate": 7.832898172323761e-07,
"logits/chosen": -2.1692872047424316,
"logits/rejected": -2.1056342124938965,
"logps/chosen": -252.186767578125,
"logps/rejected": -226.5349884033203,
"loss": 0.6901,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.03267771750688553,
"rewards/margins": 0.0062465183436870575,
"rewards/rejected": 0.026431197300553322,
"step": 60
},
{
"epoch": 0.018319811567652448,
"grad_norm": 2.0599091053009033,
"learning_rate": 9.138381201044387e-07,
"logits/chosen": -2.309943675994873,
"logits/rejected": -2.187107563018799,
"logps/chosen": -271.86541748046875,
"logps/rejected": -246.50680541992188,
"loss": 0.6877,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.04242750257253647,
"rewards/margins": 0.011090461164712906,
"rewards/rejected": 0.031337037682533264,
"step": 70
},
{
"epoch": 0.02093692750588851,
"grad_norm": 2.3880298137664795,
"learning_rate": 1.0443864229765013e-06,
"logits/chosen": -2.2041609287261963,
"logits/rejected": -2.1138315200805664,
"logps/chosen": -257.4185485839844,
"logps/rejected": -246.7639923095703,
"loss": 0.6874,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.03884550929069519,
"rewards/margins": 0.011877561919391155,
"rewards/rejected": 0.02696794643998146,
"step": 80
},
{
"epoch": 0.023554043444124574,
"grad_norm": 2.3031389713287354,
"learning_rate": 1.1749347258485642e-06,
"logits/chosen": -2.208482265472412,
"logits/rejected": -2.1343834400177,
"logps/chosen": -249.96255493164062,
"logps/rejected": -234.4242706298828,
"loss": 0.684,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.043103523552417755,
"rewards/margins": 0.018813790753483772,
"rewards/rejected": 0.024289730936288834,
"step": 90
},
{
"epoch": 0.02617115938236064,
"grad_norm": 2.119929075241089,
"learning_rate": 1.305483028720627e-06,
"logits/chosen": -2.2504467964172363,
"logits/rejected": -2.178734540939331,
"logps/chosen": -246.7833251953125,
"logps/rejected": -230.8575897216797,
"loss": 0.6807,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.04890027642250061,
"rewards/margins": 0.025924110785126686,
"rewards/rejected": 0.022976163774728775,
"step": 100
},
{
"epoch": 0.02617115938236064,
"eval_logits/chosen": -2.1481568813323975,
"eval_logits/rejected": -2.055117607116699,
"eval_logps/chosen": -259.46044921875,
"eval_logps/rejected": -242.01309204101562,
"eval_loss": 0.6808694005012512,
"eval_rewards/accuracies": 0.6554999947547913,
"eval_rewards/chosen": 0.05141494795680046,
"eval_rewards/margins": 0.025842413306236267,
"eval_rewards/rejected": 0.025572534650564194,
"eval_runtime": 1599.8543,
"eval_samples_per_second": 1.25,
"eval_steps_per_second": 0.156,
"step": 100
},
{
"epoch": 0.028788275320596704,
"grad_norm": 2.4198131561279297,
"learning_rate": 1.4360313315926894e-06,
"logits/chosen": -2.2423720359802246,
"logits/rejected": -2.1254634857177734,
"logps/chosen": -284.2754821777344,
"logps/rejected": -239.1751251220703,
"loss": 0.677,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.050642453134059906,
"rewards/margins": 0.03400001674890518,
"rewards/rejected": 0.016642430797219276,
"step": 110
},
{
"epoch": 0.031405391258832765,
"grad_norm": 2.272566556930542,
"learning_rate": 1.5665796344647521e-06,
"logits/chosen": -2.273714303970337,
"logits/rejected": -2.160338878631592,
"logps/chosen": -287.36285400390625,
"logps/rejected": -272.5426025390625,
"loss": 0.6696,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.05583573505282402,
"rewards/margins": 0.04974224418401718,
"rewards/rejected": 0.006093493662774563,
"step": 120
},
{
"epoch": 0.03402250719706883,
"grad_norm": 2.827535390853882,
"learning_rate": 1.6971279373368146e-06,
"logits/chosen": -2.2895429134368896,
"logits/rejected": -2.1921463012695312,
"logps/chosen": -250.36807250976562,
"logps/rejected": -254.2834930419922,
"loss": 0.664,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.04658503085374832,
"rewards/margins": 0.06275991350412369,
"rewards/rejected": -0.01617487706243992,
"step": 130
},
{
"epoch": 0.036639623135304895,
"grad_norm": 2.8360142707824707,
"learning_rate": 1.8276762402088774e-06,
"logits/chosen": -2.285165309906006,
"logits/rejected": -2.075157880783081,
"logps/chosen": -272.5437927246094,
"logps/rejected": -229.80880737304688,
"loss": 0.662,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.021640608087182045,
"rewards/margins": 0.06897087395191193,
"rewards/rejected": -0.047330256551504135,
"step": 140
},
{
"epoch": 0.03925673907354096,
"grad_norm": 3.0254032611846924,
"learning_rate": 1.9582245430809403e-06,
"logits/chosen": -2.289304494857788,
"logits/rejected": -2.16825532913208,
"logps/chosen": -283.7846984863281,
"logps/rejected": -248.1438446044922,
"loss": 0.6606,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.0013313032686710358,
"rewards/margins": 0.07430683076381683,
"rewards/rejected": -0.0729755312204361,
"step": 150
},
{
"epoch": 0.04187385501177702,
"grad_norm": 3.2089273929595947,
"learning_rate": 2.0887728459530026e-06,
"logits/chosen": -2.209859609603882,
"logits/rejected": -2.1506431102752686,
"logps/chosen": -262.52569580078125,
"logps/rejected": -270.04766845703125,
"loss": 0.6632,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.05757613852620125,
"rewards/margins": 0.07334659993648529,
"rewards/rejected": -0.13092274963855743,
"step": 160
},
{
"epoch": 0.04449097095001309,
"grad_norm": 3.7007648944854736,
"learning_rate": 2.2193211488250653e-06,
"logits/chosen": -2.190873384475708,
"logits/rejected": -2.1105270385742188,
"logps/chosen": -227.3632049560547,
"logps/rejected": -236.8821563720703,
"loss": 0.6635,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.0691681057214737,
"rewards/margins": 0.0719941109418869,
"rewards/rejected": -0.1411622166633606,
"step": 170
},
{
"epoch": 0.04710808688824915,
"grad_norm": 5.2089338302612305,
"learning_rate": 2.3498694516971284e-06,
"logits/chosen": -2.2085936069488525,
"logits/rejected": -2.1274473667144775,
"logps/chosen": -273.2780456542969,
"logps/rejected": -269.22747802734375,
"loss": 0.6559,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.17684438824653625,
"rewards/margins": 0.0919983834028244,
"rewards/rejected": -0.26884278655052185,
"step": 180
},
{
"epoch": 0.04972520282648522,
"grad_norm": 4.031327724456787,
"learning_rate": 2.4804177545691907e-06,
"logits/chosen": -2.2836358547210693,
"logits/rejected": -2.1660006046295166,
"logps/chosen": -281.2835388183594,
"logps/rejected": -266.60821533203125,
"loss": 0.637,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.11275825649499893,
"rewards/margins": 0.13688938319683075,
"rewards/rejected": -0.24964764714241028,
"step": 190
},
{
"epoch": 0.05234231876472128,
"grad_norm": 6.425544261932373,
"learning_rate": 2.610966057441254e-06,
"logits/chosen": -2.2068774700164795,
"logits/rejected": -2.0849859714508057,
"logps/chosen": -266.81500244140625,
"logps/rejected": -241.287353515625,
"loss": 0.6438,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.1718008667230606,
"rewards/margins": 0.12619325518608093,
"rewards/rejected": -0.29799407720565796,
"step": 200
},
{
"epoch": 0.05234231876472128,
"eval_logits/chosen": -2.0999979972839355,
"eval_logits/rejected": -2.011294364929199,
"eval_logps/chosen": -283.4154357910156,
"eval_logps/rejected": -278.46148681640625,
"eval_loss": 0.6356053948402405,
"eval_rewards/accuracies": 0.6759999990463257,
"eval_rewards/chosen": -0.18813487887382507,
"eval_rewards/margins": 0.15077635645866394,
"eval_rewards/rejected": -0.3389112055301666,
"eval_runtime": 1598.5625,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 200
},
{
"epoch": 0.05495943470295734,
"grad_norm": 4.150570869445801,
"learning_rate": 2.741514360313316e-06,
"logits/chosen": -2.255913734436035,
"logits/rejected": -2.120842456817627,
"logps/chosen": -280.205810546875,
"logps/rejected": -268.5466613769531,
"loss": 0.6131,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.18164019286632538,
"rewards/margins": 0.19993841648101807,
"rewards/rejected": -0.38157862424850464,
"step": 210
},
{
"epoch": 0.05757655064119341,
"grad_norm": 4.034811496734619,
"learning_rate": 2.872062663185379e-06,
"logits/chosen": -2.1413655281066895,
"logits/rejected": -2.093209743499756,
"logps/chosen": -298.56549072265625,
"logps/rejected": -289.4757995605469,
"loss": 0.6254,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.4465310573577881,
"rewards/margins": 0.17338070273399353,
"rewards/rejected": -0.619911789894104,
"step": 220
},
{
"epoch": 0.06019366657942947,
"grad_norm": 6.063634395599365,
"learning_rate": 3.0026109660574416e-06,
"logits/chosen": -2.266742706298828,
"logits/rejected": -2.164170742034912,
"logps/chosen": -352.3594665527344,
"logps/rejected": -333.99053955078125,
"loss": 0.6479,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.46258726716041565,
"rewards/margins": 0.17155149579048157,
"rewards/rejected": -0.6341387033462524,
"step": 230
},
{
"epoch": 0.06281078251766553,
"grad_norm": 5.352989673614502,
"learning_rate": 3.1331592689295043e-06,
"logits/chosen": -2.136970281600952,
"logits/rejected": -2.0645029544830322,
"logps/chosen": -342.98870849609375,
"logps/rejected": -350.2208251953125,
"loss": 0.6237,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.4678193926811218,
"rewards/margins": 0.2228115350008011,
"rewards/rejected": -0.6906309127807617,
"step": 240
},
{
"epoch": 0.06542789845590159,
"grad_norm": 6.087672233581543,
"learning_rate": 3.263707571801567e-06,
"logits/chosen": -2.162543296813965,
"logits/rejected": -2.1394383907318115,
"logps/chosen": -298.310302734375,
"logps/rejected": -299.23260498046875,
"loss": 0.6014,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3882629871368408,
"rewards/margins": 0.26374003291130066,
"rewards/rejected": -0.6520029902458191,
"step": 250
},
{
"epoch": 0.06804501439413765,
"grad_norm": 9.075284004211426,
"learning_rate": 3.3942558746736293e-06,
"logits/chosen": -2.2069649696350098,
"logits/rejected": -2.0832362174987793,
"logps/chosen": -315.36358642578125,
"logps/rejected": -315.8193664550781,
"loss": 0.6231,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.43095770478248596,
"rewards/margins": 0.2242399901151657,
"rewards/rejected": -0.6551976203918457,
"step": 260
},
{
"epoch": 0.07066213033237373,
"grad_norm": 4.0352067947387695,
"learning_rate": 3.524804177545692e-06,
"logits/chosen": -2.1507174968719482,
"logits/rejected": -2.084745407104492,
"logps/chosen": -323.89361572265625,
"logps/rejected": -321.73907470703125,
"loss": 0.5962,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6326006650924683,
"rewards/margins": 0.293195515871048,
"rewards/rejected": -0.9257962107658386,
"step": 270
},
{
"epoch": 0.07327924627060979,
"grad_norm": 6.033257007598877,
"learning_rate": 3.6553524804177547e-06,
"logits/chosen": -2.17421293258667,
"logits/rejected": -2.054452896118164,
"logps/chosen": -340.52557373046875,
"logps/rejected": -340.84259033203125,
"loss": 0.627,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.8170977830886841,
"rewards/margins": 0.2173783779144287,
"rewards/rejected": -1.0344761610031128,
"step": 280
},
{
"epoch": 0.07589636220884585,
"grad_norm": 6.212845325469971,
"learning_rate": 3.7859007832898174e-06,
"logits/chosen": -2.1532936096191406,
"logits/rejected": -2.0849764347076416,
"logps/chosen": -353.55975341796875,
"logps/rejected": -354.45782470703125,
"loss": 0.6139,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6797887086868286,
"rewards/margins": 0.26041343808174133,
"rewards/rejected": -0.9402019381523132,
"step": 290
},
{
"epoch": 0.07851347814708191,
"grad_norm": 9.043365478515625,
"learning_rate": 3.9164490861618806e-06,
"logits/chosen": -2.1526737213134766,
"logits/rejected": -2.0208210945129395,
"logps/chosen": -306.4209899902344,
"logps/rejected": -320.6431884765625,
"loss": 0.6073,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6231580972671509,
"rewards/margins": 0.2800864577293396,
"rewards/rejected": -0.9032446146011353,
"step": 300
},
{
"epoch": 0.07851347814708191,
"eval_logits/chosen": -2.0781641006469727,
"eval_logits/rejected": -1.9948630332946777,
"eval_logps/chosen": -333.25830078125,
"eval_logps/rejected": -342.0091247558594,
"eval_loss": 0.6053693890571594,
"eval_rewards/accuracies": 0.6815000176429749,
"eval_rewards/chosen": -0.6865635514259338,
"eval_rewards/margins": 0.2878238558769226,
"eval_rewards/rejected": -0.9743873476982117,
"eval_runtime": 1598.3515,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 300
},
{
"epoch": 0.08113059408531798,
"grad_norm": 8.154093742370605,
"learning_rate": 4.046997389033943e-06,
"logits/chosen": -2.2768380641937256,
"logits/rejected": -2.16288161277771,
"logps/chosen": -355.626953125,
"logps/rejected": -341.3152770996094,
"loss": 0.5637,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6677303910255432,
"rewards/margins": 0.39315730333328247,
"rewards/rejected": -1.0608876943588257,
"step": 310
},
{
"epoch": 0.08374771002355404,
"grad_norm": 12.870465278625488,
"learning_rate": 4.177545691906005e-06,
"logits/chosen": -2.2009758949279785,
"logits/rejected": -2.101364850997925,
"logps/chosen": -330.45904541015625,
"logps/rejected": -343.94305419921875,
"loss": 0.6039,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.7225368022918701,
"rewards/margins": 0.30548617243766785,
"rewards/rejected": -1.0280230045318604,
"step": 320
},
{
"epoch": 0.08636482596179011,
"grad_norm": 5.590673923492432,
"learning_rate": 4.308093994778068e-06,
"logits/chosen": -2.086026668548584,
"logits/rejected": -2.0480690002441406,
"logps/chosen": -313.5097351074219,
"logps/rejected": -321.5640869140625,
"loss": 0.5895,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.5396815538406372,
"rewards/margins": 0.322670042514801,
"rewards/rejected": -0.8623515963554382,
"step": 330
},
{
"epoch": 0.08898194190002617,
"grad_norm": 7.664637088775635,
"learning_rate": 4.4386422976501306e-06,
"logits/chosen": -2.1061110496520996,
"logits/rejected": -2.049543857574463,
"logps/chosen": -358.71868896484375,
"logps/rejected": -383.11328125,
"loss": 0.5694,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.6914780735969543,
"rewards/margins": 0.45771294832229614,
"rewards/rejected": -1.1491910219192505,
"step": 340
},
{
"epoch": 0.09159905783826224,
"grad_norm": 10.34107780456543,
"learning_rate": 4.569190600522193e-06,
"logits/chosen": -1.9783916473388672,
"logits/rejected": -1.901391625404358,
"logps/chosen": -401.55120849609375,
"logps/rejected": -426.84832763671875,
"loss": 0.6179,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1243749856948853,
"rewards/margins": 0.3678717613220215,
"rewards/rejected": -1.4922468662261963,
"step": 350
},
{
"epoch": 0.0942161737764983,
"grad_norm": 6.533565044403076,
"learning_rate": 4.699738903394257e-06,
"logits/chosen": -1.9793760776519775,
"logits/rejected": -1.922286033630371,
"logps/chosen": -367.68817138671875,
"logps/rejected": -385.93798828125,
"loss": 0.5861,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2674810886383057,
"rewards/margins": 0.39134687185287476,
"rewards/rejected": -1.6588280200958252,
"step": 360
},
{
"epoch": 0.09683328971473436,
"grad_norm": 9.993318557739258,
"learning_rate": 4.8302872062663196e-06,
"logits/chosen": -1.9993594884872437,
"logits/rejected": -1.8775148391723633,
"logps/chosen": -397.10125732421875,
"logps/rejected": -394.6053771972656,
"loss": 0.5852,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2026114463806152,
"rewards/margins": 0.4132401943206787,
"rewards/rejected": -1.615851640701294,
"step": 370
},
{
"epoch": 0.09945040565297043,
"grad_norm": 8.581938743591309,
"learning_rate": 4.9608355091383814e-06,
"logits/chosen": -1.9525811672210693,
"logits/rejected": -1.7932708263397217,
"logps/chosen": -388.2157287597656,
"logps/rejected": -399.0534973144531,
"loss": 0.5682,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.9025002717971802,
"rewards/margins": 0.44248518347740173,
"rewards/rejected": -1.3449854850769043,
"step": 380
},
{
"epoch": 0.1020675215912065,
"grad_norm": 17.113487243652344,
"learning_rate": 4.9999488562447675e-06,
"logits/chosen": -1.920275330543518,
"logits/rejected": -1.8368419408798218,
"logps/chosen": -342.7761535644531,
"logps/rejected": -365.7699279785156,
"loss": 0.5702,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.6181488633155823,
"rewards/margins": 0.4308691620826721,
"rewards/rejected": -1.0490180253982544,
"step": 390
},
{
"epoch": 0.10468463752944256,
"grad_norm": 18.049278259277344,
"learning_rate": 4.999698361256577e-06,
"logits/chosen": -1.844530463218689,
"logits/rejected": -1.724854826927185,
"logps/chosen": -372.54058837890625,
"logps/rejected": -370.39080810546875,
"loss": 0.5956,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.0718820095062256,
"rewards/margins": 0.3849504590034485,
"rewards/rejected": -1.4568325281143188,
"step": 400
},
{
"epoch": 0.10468463752944256,
"eval_logits/chosen": -1.6757981777191162,
"eval_logits/rejected": -1.5843830108642578,
"eval_logps/chosen": -409.4522399902344,
"eval_logps/rejected": -440.5653381347656,
"eval_loss": 0.5824012160301208,
"eval_rewards/accuracies": 0.6830000281333923,
"eval_rewards/chosen": -1.448502540588379,
"eval_rewards/margins": 0.5114473700523376,
"eval_rewards/rejected": -1.9599499702453613,
"eval_runtime": 1596.5137,
"eval_samples_per_second": 1.253,
"eval_steps_per_second": 0.157,
"step": 400
},
{
"epoch": 0.10730175346767862,
"grad_norm": 7.473143577575684,
"learning_rate": 4.999239142174581e-06,
"logits/chosen": -1.7772619724273682,
"logits/rejected": -1.71932053565979,
"logps/chosen": -362.2901306152344,
"logps/rejected": -392.97271728515625,
"loss": 0.6257,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.2004709243774414,
"rewards/margins": 0.3244817852973938,
"rewards/rejected": -1.5249526500701904,
"step": 410
},
{
"epoch": 0.10991886940591468,
"grad_norm": 5.672206401824951,
"learning_rate": 4.99857123734344e-06,
"logits/chosen": -1.798180341720581,
"logits/rejected": -1.6758928298950195,
"logps/chosen": -342.5751037597656,
"logps/rejected": -385.69757080078125,
"loss": 0.5303,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1391090154647827,
"rewards/margins": 0.5212605595588684,
"rewards/rejected": -1.660369634628296,
"step": 420
},
{
"epoch": 0.11253598534415074,
"grad_norm": 7.920849800109863,
"learning_rate": 4.997694702533016e-06,
"logits/chosen": -1.839582085609436,
"logits/rejected": -1.7662830352783203,
"logps/chosen": -396.0287170410156,
"logps/rejected": -421.93707275390625,
"loss": 0.542,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1852526664733887,
"rewards/margins": 0.49562257528305054,
"rewards/rejected": -1.6808754205703735,
"step": 430
},
{
"epoch": 0.11515310128238682,
"grad_norm": 9.021227836608887,
"learning_rate": 4.996609610933713e-06,
"logits/chosen": -1.9095804691314697,
"logits/rejected": -1.8493105173110962,
"logps/chosen": -372.8834533691406,
"logps/rejected": -393.52532958984375,
"loss": 0.575,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.0081276893615723,
"rewards/margins": 0.4901418089866638,
"rewards/rejected": -1.4982694387435913,
"step": 440
},
{
"epoch": 0.11777021722062288,
"grad_norm": 8.462127685546875,
"learning_rate": 4.995316053150366e-06,
"logits/chosen": -1.7473382949829102,
"logits/rejected": -1.6900889873504639,
"logps/chosen": -388.23175048828125,
"logps/rejected": -426.498046875,
"loss": 0.5359,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.2048237323760986,
"rewards/margins": 0.612975537776947,
"rewards/rejected": -1.8177993297576904,
"step": 450
},
{
"epoch": 0.12038733315885894,
"grad_norm": 12.204924583435059,
"learning_rate": 4.9938141371946815e-06,
"logits/chosen": -1.7278436422348022,
"logits/rejected": -1.6576976776123047,
"logps/chosen": -472.71600341796875,
"logps/rejected": -530.8753662109375,
"loss": 0.5586,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.0762267112731934,
"rewards/margins": 0.7278280258178711,
"rewards/rejected": -2.8040547370910645,
"step": 460
},
{
"epoch": 0.123004449097095,
"grad_norm": 6.473967552185059,
"learning_rate": 4.992103988476206e-06,
"logits/chosen": -1.8171007633209229,
"logits/rejected": -1.712386131286621,
"logps/chosen": -393.3753662109375,
"logps/rejected": -441.9925231933594,
"loss": 0.5643,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4961185455322266,
"rewards/margins": 0.6115677356719971,
"rewards/rejected": -2.1076862812042236,
"step": 470
},
{
"epoch": 0.12562156503533106,
"grad_norm": 5.5269598960876465,
"learning_rate": 4.990185749791866e-06,
"logits/chosen": -1.9438987970352173,
"logits/rejected": -1.8433564901351929,
"logps/chosen": -341.74261474609375,
"logps/rejected": -403.5177917480469,
"loss": 0.5323,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8496583104133606,
"rewards/margins": 0.5994860529899597,
"rewards/rejected": -1.4491443634033203,
"step": 480
},
{
"epoch": 0.12823868097356714,
"grad_norm": 9.541817665100098,
"learning_rate": 4.9880595813140395e-06,
"logits/chosen": -1.9532935619354248,
"logits/rejected": -1.8457939624786377,
"logps/chosen": -374.6437072753906,
"logps/rejected": -403.85614013671875,
"loss": 0.5377,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8984916806221008,
"rewards/margins": 0.5953295826911926,
"rewards/rejected": -1.493821144104004,
"step": 490
},
{
"epoch": 0.13085579691180318,
"grad_norm": 13.012099266052246,
"learning_rate": 4.985725660577184e-06,
"logits/chosen": -1.8454961776733398,
"logits/rejected": -1.7134668827056885,
"logps/chosen": -394.9266052246094,
"logps/rejected": -415.48138427734375,
"loss": 0.5643,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2125742435455322,
"rewards/margins": 0.6179044842720032,
"rewards/rejected": -1.8304786682128906,
"step": 500
},
{
"epoch": 0.13085579691180318,
"eval_logits/chosen": -1.6658297777175903,
"eval_logits/rejected": -1.56244695186615,
"eval_logps/chosen": -379.18035888671875,
"eval_logps/rejected": -420.46356201171875,
"eval_loss": 0.5725830793380737,
"eval_rewards/accuracies": 0.6915000081062317,
"eval_rewards/chosen": -1.1457839012145996,
"eval_rewards/margins": 0.6131481528282166,
"eval_rewards/rejected": -1.758932113647461,
"eval_runtime": 1597.6305,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.156,
"step": 500
},
{
"epoch": 0.13347291285003926,
"grad_norm": 9.327789306640625,
"learning_rate": 4.983184182463009e-06,
"logits/chosen": -1.7511212825775146,
"logits/rejected": -1.6526283025741577,
"logps/chosen": -403.2107238769531,
"logps/rejected": -449.2831115722656,
"loss": 0.5133,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.2511955499649048,
"rewards/margins": 0.830196738243103,
"rewards/rejected": -2.0813920497894287,
"step": 510
},
{
"epoch": 0.1360900287882753,
"grad_norm": 20.807098388671875,
"learning_rate": 4.980435359184203e-06,
"logits/chosen": -1.7725025415420532,
"logits/rejected": -1.7354393005371094,
"logps/chosen": -416.3721618652344,
"logps/rejected": -457.4166564941406,
"loss": 0.5914,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4335078001022339,
"rewards/margins": 0.5734153985977173,
"rewards/rejected": -2.006923198699951,
"step": 520
},
{
"epoch": 0.13870714472651138,
"grad_norm": 9.319575309753418,
"learning_rate": 4.9774794202667236e-06,
"logits/chosen": -1.8060506582260132,
"logits/rejected": -1.7954254150390625,
"logps/chosen": -347.8722229003906,
"logps/rejected": -408.69842529296875,
"loss": 0.568,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8692201375961304,
"rewards/margins": 0.5232836008071899,
"rewards/rejected": -1.3925037384033203,
"step": 530
},
{
"epoch": 0.14132426066474746,
"grad_norm": 18.703466415405273,
"learning_rate": 4.974316612530615e-06,
"logits/chosen": -1.718746542930603,
"logits/rejected": -1.6051177978515625,
"logps/chosen": -397.3692932128906,
"logps/rejected": -437.828125,
"loss": 0.4615,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.1902121305465698,
"rewards/margins": 0.8715072870254517,
"rewards/rejected": -2.0617194175720215,
"step": 540
},
{
"epoch": 0.1439413766029835,
"grad_norm": 13.324700355529785,
"learning_rate": 4.970947200069416e-06,
"logits/chosen": -1.5196049213409424,
"logits/rejected": -1.463122010231018,
"logps/chosen": -523.3265380859375,
"logps/rejected": -561.0288696289062,
"loss": 0.6068,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.449286937713623,
"rewards/margins": 0.5750513076782227,
"rewards/rejected": -3.0243382453918457,
"step": 550
},
{
"epoch": 0.14655849254121958,
"grad_norm": 7.8647990226745605,
"learning_rate": 4.967371464228096e-06,
"logits/chosen": -1.6284644603729248,
"logits/rejected": -1.54505455493927,
"logps/chosen": -521.0708618164062,
"logps/rejected": -584.5469970703125,
"loss": 0.5398,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.636112689971924,
"rewards/margins": 0.6315088868141174,
"rewards/rejected": -3.2676215171813965,
"step": 560
},
{
"epoch": 0.14917560847945563,
"grad_norm": 8.891359329223633,
"learning_rate": 4.963589703579569e-06,
"logits/chosen": -1.7360155582427979,
"logits/rejected": -1.6161645650863647,
"logps/chosen": -575.2686767578125,
"logps/rejected": -591.63671875,
"loss": 0.6015,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.752664566040039,
"rewards/margins": 0.5114163160324097,
"rewards/rejected": -3.264080762863159,
"step": 570
},
{
"epoch": 0.1517927244176917,
"grad_norm": 13.137563705444336,
"learning_rate": 4.9596022338997615e-06,
"logits/chosen": -1.8721704483032227,
"logits/rejected": -1.7023900747299194,
"logps/chosen": -481.35992431640625,
"logps/rejected": -496.80792236328125,
"loss": 0.5479,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.8363927602767944,
"rewards/margins": 0.565157949924469,
"rewards/rejected": -2.401550769805908,
"step": 580
},
{
"epoch": 0.15440984035592778,
"grad_norm": 9.32674789428711,
"learning_rate": 4.955409388141243e-06,
"logits/chosen": -1.9033949375152588,
"logits/rejected": -1.81440007686615,
"logps/chosen": -351.5168762207031,
"logps/rejected": -374.45159912109375,
"loss": 0.5774,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.9435514211654663,
"rewards/margins": 0.46548739075660706,
"rewards/rejected": -1.4090386629104614,
"step": 590
},
{
"epoch": 0.15702695629416383,
"grad_norm": 8.5282621383667,
"learning_rate": 4.951011516405429e-06,
"logits/chosen": -1.9967750310897827,
"logits/rejected": -1.959011435508728,
"logps/chosen": -324.1109924316406,
"logps/rejected": -367.7194519042969,
"loss": 0.5373,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7343847155570984,
"rewards/margins": 0.5855700373649597,
"rewards/rejected": -1.319954752922058,
"step": 600
},
{
"epoch": 0.15702695629416383,
"eval_logits/chosen": -1.7954951524734497,
"eval_logits/rejected": -1.6945267915725708,
"eval_logps/chosen": -377.46051025390625,
"eval_logps/rejected": -426.21209716796875,
"eval_loss": 0.5631101727485657,
"eval_rewards/accuracies": 0.703000009059906,
"eval_rewards/chosen": -1.1285854578018188,
"eval_rewards/margins": 0.6878318190574646,
"eval_rewards/rejected": -1.8164173364639282,
"eval_runtime": 1597.4817,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.156,
"step": 600
},
{
"epoch": 0.1596440722323999,
"grad_norm": 8.468684196472168,
"learning_rate": 4.946408985913344e-06,
"logits/chosen": -1.8055137395858765,
"logits/rejected": -1.7273778915405273,
"logps/chosen": -379.7837829589844,
"logps/rejected": -452.4205627441406,
"loss": 0.5374,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.3405410051345825,
"rewards/margins": 0.9033535122871399,
"rewards/rejected": -2.243894577026367,
"step": 610
},
{
"epoch": 0.16226118817063595,
"grad_norm": 8.468870162963867,
"learning_rate": 4.941602180974958e-06,
"logits/chosen": -1.712264060974121,
"logits/rejected": -1.5265555381774902,
"logps/chosen": -442.0708923339844,
"logps/rejected": -476.46112060546875,
"loss": 0.5459,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.5436513423919678,
"rewards/margins": 0.9296489953994751,
"rewards/rejected": -2.4733004570007324,
"step": 620
},
{
"epoch": 0.16487830410887203,
"grad_norm": 8.517386436462402,
"learning_rate": 4.936591502957101e-06,
"logits/chosen": -1.650029182434082,
"logits/rejected": -1.5338895320892334,
"logps/chosen": -383.0647888183594,
"logps/rejected": -462.663330078125,
"loss": 0.5098,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3505860567092896,
"rewards/margins": 0.8982070684432983,
"rewards/rejected": -2.248793125152588,
"step": 630
},
{
"epoch": 0.16749542004710807,
"grad_norm": 7.095081329345703,
"learning_rate": 4.931377370249946e-06,
"logits/chosen": -1.6547809839248657,
"logits/rejected": -1.4826760292053223,
"logps/chosen": -434.3949279785156,
"logps/rejected": -483.05657958984375,
"loss": 0.5251,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.691422462463379,
"rewards/margins": 0.7445409297943115,
"rewards/rejected": -2.4359633922576904,
"step": 640
},
{
"epoch": 0.17011253598534415,
"grad_norm": 16.53241539001465,
"learning_rate": 4.925960218232073e-06,
"logits/chosen": -1.586260199546814,
"logits/rejected": -1.4878358840942383,
"logps/chosen": -451.49871826171875,
"logps/rejected": -535.3369750976562,
"loss": 0.5454,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.9628738164901733,
"rewards/margins": 0.9344732165336609,
"rewards/rejected": -2.8973469734191895,
"step": 650
},
{
"epoch": 0.17272965192358022,
"grad_norm": 12.95315170288086,
"learning_rate": 4.920340499234116e-06,
"logits/chosen": -1.5459370613098145,
"logits/rejected": -1.3826103210449219,
"logps/chosen": -457.12640380859375,
"logps/rejected": -501.2940368652344,
"loss": 0.5561,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.9127668142318726,
"rewards/margins": 0.7934447526931763,
"rewards/rejected": -2.706211566925049,
"step": 660
},
{
"epoch": 0.17534676786181627,
"grad_norm": 13.084680557250977,
"learning_rate": 4.914518682500995e-06,
"logits/chosen": -1.6595103740692139,
"logits/rejected": -1.5149381160736084,
"logps/chosen": -505.25360107421875,
"logps/rejected": -557.6201782226562,
"loss": 0.5516,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.2182066440582275,
"rewards/margins": 0.930493950843811,
"rewards/rejected": -3.1487009525299072,
"step": 670
},
{
"epoch": 0.17796388380005235,
"grad_norm": 9.387535095214844,
"learning_rate": 4.9084952541527315e-06,
"logits/chosen": -1.5597246885299683,
"logits/rejected": -1.4202911853790283,
"logps/chosen": -468.02777099609375,
"logps/rejected": -508.26483154296875,
"loss": 0.5279,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.9475713968276978,
"rewards/margins": 0.8734270334243774,
"rewards/rejected": -2.820998430252075,
"step": 680
},
{
"epoch": 0.1805809997382884,
"grad_norm": 10.536702156066895,
"learning_rate": 4.902270717143858e-06,
"logits/chosen": -1.6048400402069092,
"logits/rejected": -1.5331923961639404,
"logps/chosen": -395.47686767578125,
"logps/rejected": -517.7385864257812,
"loss": 0.4506,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.5932337045669556,
"rewards/margins": 1.1062676906585693,
"rewards/rejected": -2.6995015144348145,
"step": 690
},
{
"epoch": 0.18319811567652447,
"grad_norm": 6.061295509338379,
"learning_rate": 4.895845591221427e-06,
"logits/chosen": -1.5165598392486572,
"logits/rejected": -1.4684141874313354,
"logps/chosen": -450.1396484375,
"logps/rejected": -525.9497680664062,
"loss": 0.5394,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9692981243133545,
"rewards/margins": 0.802649199962616,
"rewards/rejected": -2.7719473838806152,
"step": 700
},
{
"epoch": 0.18319811567652447,
"eval_logits/chosen": -1.271895408630371,
"eval_logits/rejected": -1.1628035306930542,
"eval_logps/chosen": -491.6012268066406,
"eval_logps/rejected": -551.1991577148438,
"eval_loss": 0.5473812222480774,
"eval_rewards/accuracies": 0.7039999961853027,
"eval_rewards/chosen": -2.2699923515319824,
"eval_rewards/margins": 0.7962960004806519,
"eval_rewards/rejected": -3.066288471221924,
"eval_runtime": 1597.7171,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.156,
"step": 700
},
{
"epoch": 0.18581523161476055,
"grad_norm": 13.924752235412598,
"learning_rate": 4.8892204128816e-06,
"logits/chosen": -1.3581098318099976,
"logits/rejected": -1.2599608898162842,
"logps/chosen": -515.8197631835938,
"logps/rejected": -587.2666625976562,
"loss": 0.5115,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.4833767414093018,
"rewards/margins": 0.8504128456115723,
"rewards/rejected": -3.333789348602295,
"step": 710
},
{
"epoch": 0.1884323475529966,
"grad_norm": 9.558119773864746,
"learning_rate": 4.882395735324864e-06,
"logits/chosen": -1.140836477279663,
"logits/rejected": -0.9971574544906616,
"logps/chosen": -565.4805908203125,
"logps/rejected": -648.9151611328125,
"loss": 0.4868,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.9943554401397705,
"rewards/margins": 1.006219744682312,
"rewards/rejected": -4.000575065612793,
"step": 720
},
{
"epoch": 0.19104946349123267,
"grad_norm": 16.967636108398438,
"learning_rate": 4.87537212840983e-06,
"logits/chosen": -1.0284086465835571,
"logits/rejected": -0.9112738370895386,
"logps/chosen": -624.4201049804688,
"logps/rejected": -673.4655151367188,
"loss": 0.6183,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.582063674926758,
"rewards/margins": 0.7967410087585449,
"rewards/rejected": -4.378804683685303,
"step": 730
},
{
"epoch": 0.19366657942946872,
"grad_norm": 17.04477310180664,
"learning_rate": 4.8681501786056545e-06,
"logits/chosen": -1.1650705337524414,
"logits/rejected": -1.0339478254318237,
"logps/chosen": -438.68585205078125,
"logps/rejected": -490.553955078125,
"loss": 0.497,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.1054441928863525,
"rewards/margins": 0.8412445187568665,
"rewards/rejected": -2.946688652038574,
"step": 740
},
{
"epoch": 0.1962836953677048,
"grad_norm": 20.108728408813477,
"learning_rate": 4.860730488943068e-06,
"logits/chosen": -1.2052314281463623,
"logits/rejected": -1.157947301864624,
"logps/chosen": -439.97589111328125,
"logps/rejected": -518.9736328125,
"loss": 0.5097,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.0552916526794434,
"rewards/margins": 0.809428870677948,
"rewards/rejected": -2.864720106124878,
"step": 750
},
{
"epoch": 0.19890081130594087,
"grad_norm": 6.034134387969971,
"learning_rate": 4.853113678964022e-06,
"logits/chosen": -1.3641645908355713,
"logits/rejected": -1.3035409450531006,
"logps/chosen": -421.71343994140625,
"logps/rejected": -494.55810546875,
"loss": 0.5004,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4936286211013794,
"rewards/margins": 0.8079965710639954,
"rewards/rejected": -2.3016250133514404,
"step": 760
},
{
"epoch": 0.20151792724417691,
"grad_norm": 5.774670124053955,
"learning_rate": 4.845300384669958e-06,
"logits/chosen": -1.4722392559051514,
"logits/rejected": -1.3768599033355713,
"logps/chosen": -374.771484375,
"logps/rejected": -418.16656494140625,
"loss": 0.5331,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2165186405181885,
"rewards/margins": 0.6629087328910828,
"rewards/rejected": -1.8794273138046265,
"step": 770
},
{
"epoch": 0.204135043182413,
"grad_norm": 8.666418075561523,
"learning_rate": 4.837291258468701e-06,
"logits/chosen": -1.63046395778656,
"logits/rejected": -1.5205990076065063,
"logps/chosen": -421.2227478027344,
"logps/rejected": -473.2460021972656,
"loss": 0.5528,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3603990077972412,
"rewards/margins": 0.7691534161567688,
"rewards/rejected": -2.1295523643493652,
"step": 780
},
{
"epoch": 0.20675215912064904,
"grad_norm": 15.76352310180664,
"learning_rate": 4.829086969119984e-06,
"logits/chosen": -1.4890462160110474,
"logits/rejected": -1.4975069761276245,
"logps/chosen": -429.3702087402344,
"logps/rejected": -498.40496826171875,
"loss": 0.5898,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7876300811767578,
"rewards/margins": 0.6727088093757629,
"rewards/rejected": -2.460339069366455,
"step": 790
},
{
"epoch": 0.2093692750588851,
"grad_norm": 14.872965812683105,
"learning_rate": 4.820688201679605e-06,
"logits/chosen": -1.6982934474945068,
"logits/rejected": -1.4959887266159058,
"logps/chosen": -433.38470458984375,
"logps/rejected": -451.4923400878906,
"loss": 0.4983,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.708482027053833,
"rewards/margins": 0.8051029443740845,
"rewards/rejected": -2.513584852218628,
"step": 800
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -1.599565863609314,
"eval_logits/rejected": -1.510375738143921,
"eval_logps/chosen": -420.76544189453125,
"eval_logps/rejected": -474.2269287109375,
"eval_loss": 0.5322815179824829,
"eval_rewards/accuracies": 0.7225000262260437,
"eval_rewards/chosen": -1.5616350173950195,
"eval_rewards/margins": 0.7349306344985962,
"eval_rewards/rejected": -2.2965660095214844,
"eval_runtime": 1595.9174,
"eval_samples_per_second": 1.253,
"eval_steps_per_second": 0.157,
"step": 800
},
{
"epoch": 0.21198639099712116,
"grad_norm": 6.158263683319092,
"learning_rate": 4.8120956574422315e-06,
"logits/chosen": -1.8132537603378296,
"logits/rejected": -1.8140255212783813,
"logps/chosen": -400.74359130859375,
"logps/rejected": -452.96392822265625,
"loss": 0.58,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.2691619396209717,
"rewards/margins": 0.5926799178123474,
"rewards/rejected": -1.8618419170379639,
"step": 810
},
{
"epoch": 0.21460350693535724,
"grad_norm": 14.991472244262695,
"learning_rate": 4.803310053882831e-06,
"logits/chosen": -1.883763313293457,
"logits/rejected": -1.9024746417999268,
"logps/chosen": -327.5868225097656,
"logps/rejected": -404.1509704589844,
"loss": 0.5258,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9834963083267212,
"rewards/margins": 0.6419707536697388,
"rewards/rejected": -1.6254669427871704,
"step": 820
},
{
"epoch": 0.2172206228735933,
"grad_norm": 10.492687225341797,
"learning_rate": 4.794332124596775e-06,
"logits/chosen": -1.8571357727050781,
"logits/rejected": -1.813534140586853,
"logps/chosen": -396.38372802734375,
"logps/rejected": -451.81634521484375,
"loss": 0.5868,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2799537181854248,
"rewards/margins": 0.5913030505180359,
"rewards/rejected": -1.8712568283081055,
"step": 830
},
{
"epoch": 0.21983773881182936,
"grad_norm": 10.114534378051758,
"learning_rate": 4.785162619238575e-06,
"logits/chosen": -1.8008487224578857,
"logits/rejected": -1.698720932006836,
"logps/chosen": -378.2573547363281,
"logps/rejected": -424.20782470703125,
"loss": 0.5402,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.244909405708313,
"rewards/margins": 0.7323002815246582,
"rewards/rejected": -1.9772096872329712,
"step": 840
},
{
"epoch": 0.22245485475006543,
"grad_norm": 7.237858295440674,
"learning_rate": 4.775802303459288e-06,
"logits/chosen": -1.6909987926483154,
"logits/rejected": -1.64451003074646,
"logps/chosen": -384.024169921875,
"logps/rejected": -455.19024658203125,
"loss": 0.5251,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3631868362426758,
"rewards/margins": 0.7588311433792114,
"rewards/rejected": -2.1220178604125977,
"step": 850
},
{
"epoch": 0.22507197068830148,
"grad_norm": 20.08100700378418,
"learning_rate": 4.766251958842589e-06,
"logits/chosen": -1.5939120054244995,
"logits/rejected": -1.5511482954025269,
"logps/chosen": -447.6766662597656,
"logps/rejected": -501.3028259277344,
"loss": 0.569,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7521028518676758,
"rewards/margins": 0.6410677433013916,
"rewards/rejected": -2.3931705951690674,
"step": 860
},
{
"epoch": 0.22768908662653756,
"grad_norm": 8.703922271728516,
"learning_rate": 4.7565123828395066e-06,
"logits/chosen": -1.5731687545776367,
"logits/rejected": -1.5117802619934082,
"logps/chosen": -424.31463623046875,
"logps/rejected": -501.74725341796875,
"loss": 0.5338,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6849206686019897,
"rewards/margins": 0.7798541784286499,
"rewards/rejected": -2.4647748470306396,
"step": 870
},
{
"epoch": 0.23030620256477363,
"grad_norm": 7.03041410446167,
"learning_rate": 4.746584388701831e-06,
"logits/chosen": -1.6322410106658936,
"logits/rejected": -1.613351583480835,
"logps/chosen": -416.93463134765625,
"logps/rejected": -476.6209411621094,
"loss": 0.5234,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.5381996631622314,
"rewards/margins": 0.7384254932403564,
"rewards/rejected": -2.276625156402588,
"step": 880
},
{
"epoch": 0.23292331850300968,
"grad_norm": 7.439998149871826,
"learning_rate": 4.736468805414218e-06,
"logits/chosen": -1.6738321781158447,
"logits/rejected": -1.6679372787475586,
"logps/chosen": -351.4508056640625,
"logps/rejected": -435.82196044921875,
"loss": 0.546,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0406602621078491,
"rewards/margins": 0.7058707475662231,
"rewards/rejected": -1.7465311288833618,
"step": 890
},
{
"epoch": 0.23554043444124576,
"grad_norm": 14.334321975708008,
"learning_rate": 4.7261664776249595e-06,
"logits/chosen": -1.5745285749435425,
"logits/rejected": -1.511311650276184,
"logps/chosen": -331.2271423339844,
"logps/rejected": -421.39752197265625,
"loss": 0.4763,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.024084210395813,
"rewards/margins": 0.9915010333061218,
"rewards/rejected": -2.015585422515869,
"step": 900
},
{
"epoch": 0.23554043444124576,
"eval_logits/chosen": -1.4989054203033447,
"eval_logits/rejected": -1.415571928024292,
"eval_logps/chosen": -425.90301513671875,
"eval_logps/rejected": -485.7890319824219,
"eval_loss": 0.5385720133781433,
"eval_rewards/accuracies": 0.7160000205039978,
"eval_rewards/chosen": -1.6130101680755615,
"eval_rewards/margins": 0.7991763353347778,
"eval_rewards/rejected": -2.412186622619629,
"eval_runtime": 1597.0973,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 900
},
{
"epoch": 0.2381575503794818,
"grad_norm": 12.946525573730469,
"learning_rate": 4.715678265575463e-06,
"logits/chosen": -1.5601496696472168,
"logits/rejected": -1.4170053005218506,
"logps/chosen": -460.2450256347656,
"logps/rejected": -471.73272705078125,
"loss": 0.5673,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.759927749633789,
"rewards/margins": 0.6714185476303101,
"rewards/rejected": -2.4313464164733887,
"step": 910
},
{
"epoch": 0.24077466631771788,
"grad_norm": 7.0246663093566895,
"learning_rate": 4.705005045028415e-06,
"logits/chosen": -1.5280827283859253,
"logits/rejected": -1.4348738193511963,
"logps/chosen": -420.05682373046875,
"logps/rejected": -474.1004333496094,
"loss": 0.5575,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.5702658891677856,
"rewards/margins": 0.7144767642021179,
"rewards/rejected": -2.284742832183838,
"step": 920
},
{
"epoch": 0.24339178225595393,
"grad_norm": 10.933419227600098,
"learning_rate": 4.694147707194659e-06,
"logits/chosen": -1.7351709604263306,
"logits/rejected": -1.6752073764801025,
"logps/chosen": -409.02301025390625,
"logps/rejected": -459.7960510253906,
"loss": 0.5172,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.3725465536117554,
"rewards/margins": 0.7059783935546875,
"rewards/rejected": -2.0785250663757324,
"step": 930
},
{
"epoch": 0.24600889819419,
"grad_norm": 9.32016372680664,
"learning_rate": 4.683107158658782e-06,
"logits/chosen": -1.6696386337280273,
"logits/rejected": -1.5904427766799927,
"logps/chosen": -443.6543884277344,
"logps/rejected": -504.19293212890625,
"loss": 0.4894,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.5346601009368896,
"rewards/margins": 0.8928316235542297,
"rewards/rejected": -2.4274916648864746,
"step": 940
},
{
"epoch": 0.24862601413242608,
"grad_norm": 15.52425765991211,
"learning_rate": 4.671884321303407e-06,
"logits/chosen": -1.7015752792358398,
"logits/rejected": -1.627673864364624,
"logps/chosen": -428.1681213378906,
"logps/rejected": -498.12530517578125,
"loss": 0.5229,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.8192806243896484,
"rewards/margins": 0.843016505241394,
"rewards/rejected": -2.662297248840332,
"step": 950
},
{
"epoch": 0.2512431300706621,
"grad_norm": 9.358636856079102,
"learning_rate": 4.660480132232224e-06,
"logits/chosen": -1.7376630306243896,
"logits/rejected": -1.6711089611053467,
"logps/chosen": -425.44720458984375,
"logps/rejected": -465.4773864746094,
"loss": 0.5612,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.5372945070266724,
"rewards/margins": 0.6410431861877441,
"rewards/rejected": -2.178337812423706,
"step": 960
},
{
"epoch": 0.25386024600889817,
"grad_norm": 42.93048095703125,
"learning_rate": 4.6488955436917414e-06,
"logits/chosen": -1.6823867559432983,
"logits/rejected": -1.5265601873397827,
"logps/chosen": -445.4508361816406,
"logps/rejected": -489.4844665527344,
"loss": 0.53,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.655534029006958,
"rewards/margins": 0.9063774347305298,
"rewards/rejected": -2.5619113445281982,
"step": 970
},
{
"epoch": 0.2564773619471343,
"grad_norm": 10.477679252624512,
"learning_rate": 4.6371315229917644e-06,
"logits/chosen": -1.5334604978561401,
"logits/rejected": -1.4493004083633423,
"logps/chosen": -509.8636779785156,
"logps/rejected": -587.6615600585938,
"loss": 0.4944,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.265300750732422,
"rewards/margins": 0.9821793437004089,
"rewards/rejected": -3.2474799156188965,
"step": 980
},
{
"epoch": 0.2590944778853703,
"grad_norm": 12.81888198852539,
"learning_rate": 4.625189052424638e-06,
"logits/chosen": -1.4550929069519043,
"logits/rejected": -1.353437066078186,
"logps/chosen": -502.6724548339844,
"logps/rejected": -594.8109130859375,
"loss": 0.4652,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.6365292072296143,
"rewards/margins": 1.1491750478744507,
"rewards/rejected": -3.7857041358947754,
"step": 990
},
{
"epoch": 0.26171159382360637,
"grad_norm": 12.723479270935059,
"learning_rate": 4.613069129183218e-06,
"logits/chosen": -1.5442698001861572,
"logits/rejected": -1.4192157983779907,
"logps/chosen": -524.4293212890625,
"logps/rejected": -580.075439453125,
"loss": 0.5266,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.2106876373291016,
"rewards/margins": 0.9188516736030579,
"rewards/rejected": -3.1295390129089355,
"step": 1000
},
{
"epoch": 0.26171159382360637,
"eval_logits/chosen": -1.3050363063812256,
"eval_logits/rejected": -1.2043476104736328,
"eval_logps/chosen": -482.4830627441406,
"eval_logps/rejected": -550.0311279296875,
"eval_loss": 0.523389995098114,
"eval_rewards/accuracies": 0.7279999852180481,
"eval_rewards/chosen": -2.1788110733032227,
"eval_rewards/margins": 0.8757960796356201,
"eval_rewards/rejected": -3.0546071529388428,
"eval_runtime": 1595.4607,
"eval_samples_per_second": 1.254,
"eval_steps_per_second": 0.157,
"step": 1000
},
{
"epoch": 0.2643287097618425,
"grad_norm": 10.429312705993652,
"learning_rate": 4.600772765277607e-06,
"logits/chosen": -1.3304941654205322,
"logits/rejected": -1.263293743133545,
"logps/chosen": -446.70635986328125,
"logps/rejected": -535.6898193359375,
"loss": 0.4782,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.131920099258423,
"rewards/margins": 0.9416648745536804,
"rewards/rejected": -3.073584794998169,
"step": 1010
},
{
"epoch": 0.2669458257000785,
"grad_norm": 16.054344177246094,
"learning_rate": 4.588300987450652e-06,
"logits/chosen": -1.4466055631637573,
"logits/rejected": -1.3539087772369385,
"logps/chosen": -452.9022521972656,
"logps/rejected": -500.742919921875,
"loss": 0.5407,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.9927232265472412,
"rewards/margins": 0.8132057189941406,
"rewards/rejected": -2.8059287071228027,
"step": 1020
},
{
"epoch": 0.26956294163831457,
"grad_norm": 7.860910415649414,
"learning_rate": 4.5756548370922136e-06,
"logits/chosen": -1.514672875404358,
"logits/rejected": -1.447291374206543,
"logps/chosen": -394.26031494140625,
"logps/rejected": -461.3570251464844,
"loss": 0.5174,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.536415934562683,
"rewards/margins": 0.7608687281608582,
"rewards/rejected": -2.2972846031188965,
"step": 1030
},
{
"epoch": 0.2721800575765506,
"grad_norm": 13.006108283996582,
"learning_rate": 4.562835370152206e-06,
"logits/chosen": -1.559757113456726,
"logits/rejected": -1.4329888820648193,
"logps/chosen": -482.7850646972656,
"logps/rejected": -568.3600463867188,
"loss": 0.4777,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8161367177963257,
"rewards/margins": 1.1671006679534912,
"rewards/rejected": -2.9832375049591064,
"step": 1040
},
{
"epoch": 0.2747971735147867,
"grad_norm": 10.500707626342773,
"learning_rate": 4.54984365705243e-06,
"logits/chosen": -1.5195045471191406,
"logits/rejected": -1.4307035207748413,
"logps/chosen": -455.4422912597656,
"logps/rejected": -578.11181640625,
"loss": 0.4731,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.8703197240829468,
"rewards/margins": 1.292755126953125,
"rewards/rejected": -3.1630749702453613,
"step": 1050
},
{
"epoch": 0.27741428945302277,
"grad_norm": 8.591322898864746,
"learning_rate": 4.536680782597191e-06,
"logits/chosen": -1.5150272846221924,
"logits/rejected": -1.4561691284179688,
"logps/chosen": -402.49053955078125,
"logps/rejected": -479.2098083496094,
"loss": 0.5844,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6649020910263062,
"rewards/margins": 0.8239375352859497,
"rewards/rejected": -2.488839626312256,
"step": 1060
},
{
"epoch": 0.2800314053912588,
"grad_norm": 12.471731185913086,
"learning_rate": 4.523347845882718e-06,
"logits/chosen": -1.598962664604187,
"logits/rejected": -1.4552241563796997,
"logps/chosen": -423.9444274902344,
"logps/rejected": -476.2308654785156,
"loss": 0.4524,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3588608503341675,
"rewards/margins": 1.0158392190933228,
"rewards/rejected": -2.3747003078460693,
"step": 1070
},
{
"epoch": 0.2826485213294949,
"grad_norm": 8.733776092529297,
"learning_rate": 4.50984596020539e-06,
"logits/chosen": -1.2865254878997803,
"logits/rejected": -1.242490530014038,
"logps/chosen": -467.01776123046875,
"logps/rejected": -511.63507080078125,
"loss": 0.5623,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.8692843914031982,
"rewards/margins": 0.7691918611526489,
"rewards/rejected": -2.6384763717651367,
"step": 1080
},
{
"epoch": 0.28526563726773096,
"grad_norm": 9.051119804382324,
"learning_rate": 4.4961762529687745e-06,
"logits/chosen": -1.2762900590896606,
"logits/rejected": -1.1670513153076172,
"logps/chosen": -481.23406982421875,
"logps/rejected": -555.0721435546875,
"loss": 0.5154,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.2087130546569824,
"rewards/margins": 0.9038770794868469,
"rewards/rejected": -3.1125903129577637,
"step": 1090
},
{
"epoch": 0.287882753205967,
"grad_norm": 7.552842617034912,
"learning_rate": 4.482339865589492e-06,
"logits/chosen": -1.240541934967041,
"logits/rejected": -1.0851424932479858,
"logps/chosen": -478.9158630371094,
"logps/rejected": -492.19097900390625,
"loss": 0.59,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.127556562423706,
"rewards/margins": 0.5717890858650208,
"rewards/rejected": -2.699345827102661,
"step": 1100
},
{
"epoch": 0.287882753205967,
"eval_logits/chosen": -1.1099953651428223,
"eval_logits/rejected": -0.9899115562438965,
"eval_logps/chosen": -433.97100830078125,
"eval_logps/rejected": -478.8385314941406,
"eval_loss": 0.5277644991874695,
"eval_rewards/accuracies": 0.7300000190734863,
"eval_rewards/chosen": -1.693690299987793,
"eval_rewards/margins": 0.6489914059638977,
"eval_rewards/rejected": -2.342681646347046,
"eval_runtime": 1596.8217,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 1100
},
{
"epoch": 0.2904998691442031,
"grad_norm": 5.962321758270264,
"learning_rate": 4.468337953401909e-06,
"logits/chosen": -1.3310126066207886,
"logits/rejected": -1.2818472385406494,
"logps/chosen": -426.1422424316406,
"logps/rejected": -471.98175048828125,
"loss": 0.572,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.5579837560653687,
"rewards/margins": 0.5076408982276917,
"rewards/rejected": -2.065624713897705,
"step": 1110
},
{
"epoch": 0.29311698508243916,
"grad_norm": 5.4544477462768555,
"learning_rate": 4.45417168556166e-06,
"logits/chosen": -1.3914196491241455,
"logits/rejected": -1.305474877357483,
"logps/chosen": -344.81463623046875,
"logps/rejected": -411.66412353515625,
"loss": 0.5147,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0368053913116455,
"rewards/margins": 0.6763306856155396,
"rewards/rejected": -1.7131359577178955,
"step": 1120
},
{
"epoch": 0.2957341010206752,
"grad_norm": 12.404394149780273,
"learning_rate": 4.439842244948036e-06,
"logits/chosen": -1.3826755285263062,
"logits/rejected": -1.251558780670166,
"logps/chosen": -408.0195007324219,
"logps/rejected": -476.11114501953125,
"loss": 0.5433,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4701926708221436,
"rewards/margins": 0.7369771003723145,
"rewards/rejected": -2.207169771194458,
"step": 1130
},
{
"epoch": 0.29835121695891126,
"grad_norm": 28.483901977539062,
"learning_rate": 4.425350828065204e-06,
"logits/chosen": -1.342377781867981,
"logits/rejected": -1.165569543838501,
"logps/chosen": -453.5723571777344,
"logps/rejected": -509.80731201171875,
"loss": 0.4798,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.6950212717056274,
"rewards/margins": 1.041156530380249,
"rewards/rejected": -2.736177921295166,
"step": 1140
},
{
"epoch": 0.30096833289714736,
"grad_norm": 9.420351028442383,
"learning_rate": 4.410698644942303e-06,
"logits/chosen": -1.3900500535964966,
"logits/rejected": -1.2638782262802124,
"logps/chosen": -445.68646240234375,
"logps/rejected": -537.0733642578125,
"loss": 0.4689,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.7342853546142578,
"rewards/margins": 1.1538056135177612,
"rewards/rejected": -2.8880913257598877,
"step": 1150
},
{
"epoch": 0.3035854488353834,
"grad_norm": 11.15323543548584,
"learning_rate": 4.395886919032406e-06,
"logits/chosen": -1.3719291687011719,
"logits/rejected": -1.2517584562301636,
"logps/chosen": -444.84375,
"logps/rejected": -515.7125244140625,
"loss": 0.5478,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8111295700073242,
"rewards/margins": 0.9737447500228882,
"rewards/rejected": -2.784874439239502,
"step": 1160
},
{
"epoch": 0.30620256477361946,
"grad_norm": 11.627602577209473,
"learning_rate": 4.380916887110366e-06,
"logits/chosen": -1.5677311420440674,
"logits/rejected": -1.4189679622650146,
"logps/chosen": -418.11309814453125,
"logps/rejected": -474.2850646972656,
"loss": 0.5335,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5903016328811646,
"rewards/margins": 0.9656845331192017,
"rewards/rejected": -2.555985927581787,
"step": 1170
},
{
"epoch": 0.30881968071185556,
"grad_norm": 10.718832969665527,
"learning_rate": 4.365789799169539e-06,
"logits/chosen": -1.2974934577941895,
"logits/rejected": -1.3446776866912842,
"logps/chosen": -414.12152099609375,
"logps/rejected": -495.92340087890625,
"loss": 0.5394,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6758350133895874,
"rewards/margins": 0.7650316953659058,
"rewards/rejected": -2.440866708755493,
"step": 1180
},
{
"epoch": 0.3114367966500916,
"grad_norm": 8.005572319030762,
"learning_rate": 4.350506918317416e-06,
"logits/chosen": -1.4092941284179688,
"logits/rejected": -1.264432668685913,
"logps/chosen": -408.0471496582031,
"logps/rejected": -483.52960205078125,
"loss": 0.5148,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.605602502822876,
"rewards/margins": 0.8261173963546753,
"rewards/rejected": -2.431720018386841,
"step": 1190
},
{
"epoch": 0.31405391258832765,
"grad_norm": 7.756717681884766,
"learning_rate": 4.335069520670149e-06,
"logits/chosen": -1.2672417163848877,
"logits/rejected": -1.1932761669158936,
"logps/chosen": -391.0654602050781,
"logps/rejected": -470.7328186035156,
"loss": 0.5724,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.6141713857650757,
"rewards/margins": 0.7146965265274048,
"rewards/rejected": -2.3288679122924805,
"step": 1200
},
{
"epoch": 0.31405391258832765,
"eval_logits/chosen": -1.2473385334014893,
"eval_logits/rejected": -1.1348552703857422,
"eval_logps/chosen": -420.0863342285156,
"eval_logps/rejected": -485.28948974609375,
"eval_loss": 0.5071337819099426,
"eval_rewards/accuracies": 0.7379999756813049,
"eval_rewards/chosen": -1.5548440217971802,
"eval_rewards/margins": 0.8523474335670471,
"eval_rewards/rejected": -2.407191514968872,
"eval_runtime": 1597.3419,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 1200
},
{
"epoch": 0.3166710285265637,
"grad_norm": 5.270259857177734,
"learning_rate": 4.319478895246e-06,
"logits/chosen": -1.3534616231918335,
"logits/rejected": -1.1892000436782837,
"logps/chosen": -396.1217346191406,
"logps/rejected": -454.44378662109375,
"loss": 0.5071,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.490412950515747,
"rewards/margins": 0.8302758932113647,
"rewards/rejected": -2.3206887245178223,
"step": 1210
},
{
"epoch": 0.3192881444647998,
"grad_norm": 12.79388427734375,
"learning_rate": 4.303736343857704e-06,
"logits/chosen": -1.3265999555587769,
"logits/rejected": -1.2219207286834717,
"logps/chosen": -440.6163024902344,
"logps/rejected": -568.152099609375,
"loss": 0.4921,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.835211992263794,
"rewards/margins": 1.1568537950515747,
"rewards/rejected": -2.992065906524658,
"step": 1220
},
{
"epoch": 0.32190526040303585,
"grad_norm": 13.922798156738281,
"learning_rate": 4.287843181003772e-06,
"logits/chosen": -1.3599532842636108,
"logits/rejected": -1.237168550491333,
"logps/chosen": -486.5572204589844,
"logps/rejected": -522.0225219726562,
"loss": 0.5566,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.9279435873031616,
"rewards/margins": 0.8177730441093445,
"rewards/rejected": -2.7457165718078613,
"step": 1230
},
{
"epoch": 0.3245223763412719,
"grad_norm": 7.278261184692383,
"learning_rate": 4.27180073375873e-06,
"logits/chosen": -1.3576252460479736,
"logits/rejected": -1.2608816623687744,
"logps/chosen": -445.9983825683594,
"logps/rejected": -495.5048828125,
"loss": 0.4942,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.587226152420044,
"rewards/margins": 0.910873293876648,
"rewards/rejected": -2.4980995655059814,
"step": 1240
},
{
"epoch": 0.327139492279508,
"grad_norm": 8.15560531616211,
"learning_rate": 4.255610341662304e-06,
"logits/chosen": -1.3735462427139282,
"logits/rejected": -1.210055947303772,
"logps/chosen": -444.36285400390625,
"logps/rejected": -524.8583374023438,
"loss": 0.5542,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.8914082050323486,
"rewards/margins": 1.0044190883636475,
"rewards/rejected": -2.895827293395996,
"step": 1250
},
{
"epoch": 0.32975660821774405,
"grad_norm": 10.175187110900879,
"learning_rate": 4.2392733566075764e-06,
"logits/chosen": -1.4199110269546509,
"logits/rejected": -1.317662000656128,
"logps/chosen": -492.12139892578125,
"logps/rejected": -561.2523803710938,
"loss": 0.5923,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.361861228942871,
"rewards/margins": 0.8468947410583496,
"rewards/rejected": -3.2087559700012207,
"step": 1260
},
{
"epoch": 0.3323737241559801,
"grad_norm": 8.98975944519043,
"learning_rate": 4.2227911427280975e-06,
"logits/chosen": -1.5209752321243286,
"logits/rejected": -1.3899322748184204,
"logps/chosen": -398.7652282714844,
"logps/rejected": -442.14892578125,
"loss": 0.5594,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4862077236175537,
"rewards/margins": 0.761127769947052,
"rewards/rejected": -2.247335195541382,
"step": 1270
},
{
"epoch": 0.33499084009421615,
"grad_norm": 9.442636489868164,
"learning_rate": 4.206165076283983e-06,
"logits/chosen": -1.5363355875015259,
"logits/rejected": -1.428397536277771,
"logps/chosen": -410.4297790527344,
"logps/rejected": -474.6888122558594,
"loss": 0.5101,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6462541818618774,
"rewards/margins": 0.8527911305427551,
"rewards/rejected": -2.4990451335906982,
"step": 1280
},
{
"epoch": 0.33760795603245225,
"grad_norm": 9.517970085144043,
"learning_rate": 4.189396545546995e-06,
"logits/chosen": -1.4206923246383667,
"logits/rejected": -1.3242676258087158,
"logps/chosen": -455.42474365234375,
"logps/rejected": -532.8533325195312,
"loss": 0.5055,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.0629706382751465,
"rewards/margins": 0.9739512205123901,
"rewards/rejected": -3.036921977996826,
"step": 1290
},
{
"epoch": 0.3402250719706883,
"grad_norm": 11.92896842956543,
"learning_rate": 4.172486950684627e-06,
"logits/chosen": -1.458961844444275,
"logits/rejected": -1.4081146717071533,
"logps/chosen": -429.62548828125,
"logps/rejected": -503.53582763671875,
"loss": 0.5457,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.7603120803833008,
"rewards/margins": 0.771033525466919,
"rewards/rejected": -2.5313456058502197,
"step": 1300
},
{
"epoch": 0.3402250719706883,
"eval_logits/chosen": -1.3403185606002808,
"eval_logits/rejected": -1.242436408996582,
"eval_logps/chosen": -440.03851318359375,
"eval_logps/rejected": -507.2138366699219,
"eval_loss": 0.5013459920883179,
"eval_rewards/accuracies": 0.7434999942779541,
"eval_rewards/chosen": -1.7543656826019287,
"eval_rewards/margins": 0.8720693588256836,
"eval_rewards/rejected": -2.6264350414276123,
"eval_runtime": 1597.2746,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 1300
},
{
"epoch": 0.34284218790892435,
"grad_norm": 7.771608352661133,
"learning_rate": 4.155437703643182e-06,
"logits/chosen": -1.5240622758865356,
"logits/rejected": -1.3838030099868774,
"logps/chosen": -408.34857177734375,
"logps/rejected": -462.7867736816406,
"loss": 0.5133,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.6171270608901978,
"rewards/margins": 0.8352136611938477,
"rewards/rejected": -2.452340602874756,
"step": 1310
},
{
"epoch": 0.34545930384716045,
"grad_norm": 13.282800674438477,
"learning_rate": 4.138250228029882e-06,
"logits/chosen": -1.4314453601837158,
"logits/rejected": -1.3674726486206055,
"logps/chosen": -449.40045166015625,
"logps/rejected": -546.0065307617188,
"loss": 0.5029,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.9253730773925781,
"rewards/margins": 0.9314772486686707,
"rewards/rejected": -2.8568501472473145,
"step": 1320
},
{
"epoch": 0.3480764197853965,
"grad_norm": 5.933595180511475,
"learning_rate": 4.120925958993994e-06,
"logits/chosen": -1.3017045259475708,
"logits/rejected": -1.251571536064148,
"logps/chosen": -431.25537109375,
"logps/rejected": -517.0537109375,
"loss": 0.557,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.9745893478393555,
"rewards/margins": 0.9014847874641418,
"rewards/rejected": -2.8760738372802734,
"step": 1330
},
{
"epoch": 0.35069353572363254,
"grad_norm": 11.526862144470215,
"learning_rate": 4.103466343106999e-06,
"logits/chosen": -1.3678683042526245,
"logits/rejected": -1.2736116647720337,
"logps/chosen": -471.77728271484375,
"logps/rejected": -537.7550048828125,
"loss": 0.5154,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.0238394737243652,
"rewards/margins": 0.9198586344718933,
"rewards/rejected": -2.9436984062194824,
"step": 1340
},
{
"epoch": 0.35331065166186865,
"grad_norm": 10.23589038848877,
"learning_rate": 4.085872838241797e-06,
"logits/chosen": -1.3284608125686646,
"logits/rejected": -1.217164397239685,
"logps/chosen": -450.1647033691406,
"logps/rejected": -507.2977600097656,
"loss": 0.569,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.8795936107635498,
"rewards/margins": 0.7713474631309509,
"rewards/rejected": -2.6509411334991455,
"step": 1350
},
{
"epoch": 0.3559277676001047,
"grad_norm": 9.470162391662598,
"learning_rate": 4.06814691345098e-06,
"logits/chosen": -1.387274980545044,
"logits/rejected": -1.2676749229431152,
"logps/chosen": -405.6870422363281,
"logps/rejected": -467.76708984375,
"loss": 0.4982,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.5023095607757568,
"rewards/margins": 0.8411477208137512,
"rewards/rejected": -2.3434574604034424,
"step": 1360
},
{
"epoch": 0.35854488353834074,
"grad_norm": 11.479103088378906,
"learning_rate": 4.050290048844171e-06,
"logits/chosen": -1.4606144428253174,
"logits/rejected": -1.400508165359497,
"logps/chosen": -422.5201721191406,
"logps/rejected": -496.3780212402344,
"loss": 0.5403,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.5399492979049683,
"rewards/margins": 0.7900384068489075,
"rewards/rejected": -2.3299877643585205,
"step": 1370
},
{
"epoch": 0.3611619994765768,
"grad_norm": 9.197821617126465,
"learning_rate": 4.032303735464422e-06,
"logits/chosen": -1.6123807430267334,
"logits/rejected": -1.459542989730835,
"logps/chosen": -428.04095458984375,
"logps/rejected": -507.18243408203125,
"loss": 0.4608,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.5614588260650635,
"rewards/margins": 1.0217015743255615,
"rewards/rejected": -2.583160161972046,
"step": 1380
},
{
"epoch": 0.3637791154148129,
"grad_norm": 13.988523483276367,
"learning_rate": 4.014189475163727e-06,
"logits/chosen": -1.4627307653427124,
"logits/rejected": -1.3992432355880737,
"logps/chosen": -429.89642333984375,
"logps/rejected": -529.4677124023438,
"loss": 0.4813,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.753618836402893,
"rewards/margins": 1.0768356323242188,
"rewards/rejected": -2.8304543495178223,
"step": 1390
},
{
"epoch": 0.36639623135304894,
"grad_norm": 14.094236373901367,
"learning_rate": 3.995948780477605e-06,
"logits/chosen": -1.6445674896240234,
"logits/rejected": -1.5534647703170776,
"logps/chosen": -442.5335388183594,
"logps/rejected": -506.78253173828125,
"loss": 0.5423,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.7385400533676147,
"rewards/margins": 0.8896828889846802,
"rewards/rejected": -2.628222942352295,
"step": 1400
},
{
"epoch": 0.36639623135304894,
"eval_logits/chosen": -1.586852788925171,
"eval_logits/rejected": -1.5062702894210815,
"eval_logps/chosen": -428.40972900390625,
"eval_logps/rejected": -505.70770263671875,
"eval_loss": 0.5131703019142151,
"eval_rewards/accuracies": 0.7210000157356262,
"eval_rewards/chosen": -1.6380778551101685,
"eval_rewards/margins": 0.9732955098152161,
"eval_rewards/rejected": -2.6113734245300293,
"eval_runtime": 1597.5064,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.156,
"step": 1400
},
{
"epoch": 0.369013347291285,
"grad_norm": 13.169591903686523,
"learning_rate": 3.977583174498816e-06,
"logits/chosen": -1.6033601760864258,
"logits/rejected": -1.529076099395752,
"logps/chosen": -446.4644470214844,
"logps/rejected": -562.0938720703125,
"loss": 0.4006,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8283302783966064,
"rewards/margins": 1.3763213157653809,
"rewards/rejected": -3.2046515941619873,
"step": 1410
},
{
"epoch": 0.3716304632295211,
"grad_norm": 12.735097885131836,
"learning_rate": 3.959094190750172e-06,
"logits/chosen": -1.6629726886749268,
"logits/rejected": -1.5727919340133667,
"logps/chosen": -460.08013916015625,
"logps/rejected": -542.1484375,
"loss": 0.5034,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7491388320922852,
"rewards/margins": 1.091073989868164,
"rewards/rejected": -2.8402130603790283,
"step": 1420
},
{
"epoch": 0.37424757916775714,
"grad_norm": 9.543456077575684,
"learning_rate": 3.9404833730564975e-06,
"logits/chosen": -1.5923559665679932,
"logits/rejected": -1.5246838331222534,
"logps/chosen": -397.0897216796875,
"logps/rejected": -477.6280212402344,
"loss": 0.5221,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.4408881664276123,
"rewards/margins": 0.9207174181938171,
"rewards/rejected": -2.361605405807495,
"step": 1430
},
{
"epoch": 0.3768646951059932,
"grad_norm": 15.872808456420898,
"learning_rate": 3.921752275415712e-06,
"logits/chosen": -1.6502765417099,
"logits/rejected": -1.5927629470825195,
"logps/chosen": -403.17657470703125,
"logps/rejected": -513.6209106445312,
"loss": 0.4272,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.527881145477295,
"rewards/margins": 1.3093104362487793,
"rewards/rejected": -2.837191581726074,
"step": 1440
},
{
"epoch": 0.37948181104422923,
"grad_norm": 13.932242393493652,
"learning_rate": 3.902902461869079e-06,
"logits/chosen": -1.6036418676376343,
"logits/rejected": -1.5119550228118896,
"logps/chosen": -423.63275146484375,
"logps/rejected": -522.7612915039062,
"loss": 0.5484,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.8413972854614258,
"rewards/margins": 1.1561634540557861,
"rewards/rejected": -2.997560501098633,
"step": 1450
},
{
"epoch": 0.38209892698246534,
"grad_norm": 17.43979835510254,
"learning_rate": 3.883935506370605e-06,
"logits/chosen": -1.6155637502670288,
"logits/rejected": -1.546623706817627,
"logps/chosen": -433.8269958496094,
"logps/rejected": -493.6173400878906,
"loss": 0.5628,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8069992065429688,
"rewards/margins": 0.8752773404121399,
"rewards/rejected": -2.682276487350464,
"step": 1460
},
{
"epoch": 0.3847160429207014,
"grad_norm": 6.709349155426025,
"learning_rate": 3.864852992655617e-06,
"logits/chosen": -1.6773513555526733,
"logits/rejected": -1.6156046390533447,
"logps/chosen": -408.98004150390625,
"logps/rejected": -496.5398864746094,
"loss": 0.4636,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.6042149066925049,
"rewards/margins": 0.9984095692634583,
"rewards/rejected": -2.6026244163513184,
"step": 1470
},
{
"epoch": 0.38733315885893743,
"grad_norm": 5.378371715545654,
"learning_rate": 3.845656514108516e-06,
"logits/chosen": -1.6842008829116821,
"logits/rejected": -1.5901457071304321,
"logps/chosen": -467.2499084472656,
"logps/rejected": -508.35491943359375,
"loss": 0.5207,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0820114612579346,
"rewards/margins": 0.9600709080696106,
"rewards/rejected": -3.0420823097229004,
"step": 1480
},
{
"epoch": 0.38995027479717354,
"grad_norm": 14.073667526245117,
"learning_rate": 3.826347673629738e-06,
"logits/chosen": -1.6913728713989258,
"logits/rejected": -1.5630801916122437,
"logps/chosen": -425.8887634277344,
"logps/rejected": -511.43194580078125,
"loss": 0.4841,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7265421152114868,
"rewards/margins": 1.118648648262024,
"rewards/rejected": -2.84519100189209,
"step": 1490
},
{
"epoch": 0.3925673907354096,
"grad_norm": 15.713250160217285,
"learning_rate": 3.8069280835019062e-06,
"logits/chosen": -1.631744384765625,
"logits/rejected": -1.5060975551605225,
"logps/chosen": -430.0010681152344,
"logps/rejected": -537.3772583007812,
"loss": 0.4492,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.657666563987732,
"rewards/margins": 1.2653546333312988,
"rewards/rejected": -2.9230213165283203,
"step": 1500
},
{
"epoch": 0.3925673907354096,
"eval_logits/chosen": -1.5949609279632568,
"eval_logits/rejected": -1.4971818923950195,
"eval_logps/chosen": -423.41754150390625,
"eval_logps/rejected": -503.4827575683594,
"eval_loss": 0.5122300386428833,
"eval_rewards/accuracies": 0.7260000109672546,
"eval_rewards/chosen": -1.5881556272506714,
"eval_rewards/margins": 1.0009682178497314,
"eval_rewards/rejected": -2.5891237258911133,
"eval_runtime": 1597.6931,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.156,
"step": 1500
},
{
"epoch": 0.39518450667364563,
"grad_norm": 14.919211387634277,
"learning_rate": 3.7873993652552077e-06,
"logits/chosen": -1.6476099491119385,
"logits/rejected": -1.5861533880233765,
"logps/chosen": -370.7307434082031,
"logps/rejected": -437.010986328125,
"loss": 0.5933,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.370052695274353,
"rewards/margins": 0.7218869924545288,
"rewards/rejected": -2.091939926147461,
"step": 1510
},
{
"epoch": 0.39780162261188173,
"grad_norm": 6.893444061279297,
"learning_rate": 3.7677631495319953e-06,
"logits/chosen": -1.7386844158172607,
"logits/rejected": -1.6610181331634521,
"logps/chosen": -358.55792236328125,
"logps/rejected": -410.8779296875,
"loss": 0.5166,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9495410919189453,
"rewards/margins": 0.7035711407661438,
"rewards/rejected": -1.6531124114990234,
"step": 1520
},
{
"epoch": 0.4004187385501178,
"grad_norm": 10.669229507446289,
"learning_rate": 3.748021075950633e-06,
"logits/chosen": -1.6956592798233032,
"logits/rejected": -1.634338617324829,
"logps/chosen": -401.1066589355469,
"logps/rejected": -450.43701171875,
"loss": 0.5677,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2650973796844482,
"rewards/margins": 0.5999857187271118,
"rewards/rejected": -1.8650833368301392,
"step": 1530
},
{
"epoch": 0.40303585448835383,
"grad_norm": 13.775323867797852,
"learning_rate": 3.7281747929685824e-06,
"logits/chosen": -1.4220095872879028,
"logits/rejected": -1.3044965267181396,
"logps/chosen": -428.1224060058594,
"logps/rejected": -506.9508361816406,
"loss": 0.5483,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.9053666591644287,
"rewards/margins": 0.9458476305007935,
"rewards/rejected": -2.8512144088745117,
"step": 1540
},
{
"epoch": 0.4056529704265899,
"grad_norm": 14.96325397491455,
"learning_rate": 3.7082259577447604e-06,
"logits/chosen": -1.4775934219360352,
"logits/rejected": -1.3828151226043701,
"logps/chosen": -484.23712158203125,
"logps/rejected": -562.1600341796875,
"loss": 0.4745,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.1311516761779785,
"rewards/margins": 0.9796358942985535,
"rewards/rejected": -3.1107876300811768,
"step": 1550
},
{
"epoch": 0.408270086364826,
"grad_norm": 12.222273826599121,
"learning_rate": 3.6881762360011688e-06,
"logits/chosen": -1.5065914392471313,
"logits/rejected": -1.3439867496490479,
"logps/chosen": -507.56646728515625,
"logps/rejected": -579.8803100585938,
"loss": 0.5187,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.261667013168335,
"rewards/margins": 1.0928138494491577,
"rewards/rejected": -3.3544812202453613,
"step": 1560
},
{
"epoch": 0.410887202303062,
"grad_norm": 9.984075546264648,
"learning_rate": 3.668027301883802e-06,
"logits/chosen": -1.5400466918945312,
"logits/rejected": -1.42914617061615,
"logps/chosen": -428.8944396972656,
"logps/rejected": -516.1829833984375,
"loss": 0.4994,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7761850357055664,
"rewards/margins": 1.021864891052246,
"rewards/rejected": -2.7980499267578125,
"step": 1570
},
{
"epoch": 0.4135043182412981,
"grad_norm": 8.096105575561523,
"learning_rate": 3.64778083782286e-06,
"logits/chosen": -1.4683252573013306,
"logits/rejected": -1.4568402767181396,
"logps/chosen": -420.28662109375,
"logps/rejected": -531.93310546875,
"loss": 0.5363,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.67800772190094,
"rewards/margins": 0.8264617919921875,
"rewards/rejected": -2.504469394683838,
"step": 1580
},
{
"epoch": 0.4161214341795342,
"grad_norm": 8.454867362976074,
"learning_rate": 3.627438534392268e-06,
"logits/chosen": -1.563232421875,
"logits/rejected": -1.5413362979888916,
"logps/chosen": -382.3138122558594,
"logps/rejected": -473.07427978515625,
"loss": 0.4969,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4439641237258911,
"rewards/margins": 0.8402025103569031,
"rewards/rejected": -2.2841668128967285,
"step": 1590
},
{
"epoch": 0.4187385501177702,
"grad_norm": 8.18990421295166,
"learning_rate": 3.607002090168506e-06,
"logits/chosen": -1.374459981918335,
"logits/rejected": -1.3057047128677368,
"logps/chosen": -437.67437744140625,
"logps/rejected": -498.42022705078125,
"loss": 0.5491,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.7346376180648804,
"rewards/margins": 0.8208419680595398,
"rewards/rejected": -2.5554797649383545,
"step": 1600
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -1.2524830102920532,
"eval_logits/rejected": -1.129266381263733,
"eval_logps/chosen": -434.1912536621094,
"eval_logps/rejected": -515.1350708007812,
"eval_loss": 0.49564477801322937,
"eval_rewards/accuracies": 0.7394999861717224,
"eval_rewards/chosen": -1.6958929300308228,
"eval_rewards/margins": 1.0097541809082031,
"eval_rewards/rejected": -2.7056469917297363,
"eval_runtime": 1597.9006,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.156,
"step": 1600
},
{
"epoch": 0.4213556660560063,
"grad_norm": 5.6236395835876465,
"learning_rate": 3.586473211588787e-06,
"logits/chosen": -1.3308565616607666,
"logits/rejected": -1.2487056255340576,
"logps/chosen": -416.17218017578125,
"logps/rejected": -541.766357421875,
"loss": 0.4321,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.7147547006607056,
"rewards/margins": 1.190341591835022,
"rewards/rejected": -2.9050965309143066,
"step": 1610
},
{
"epoch": 0.4239727819942423,
"grad_norm": 17.16832160949707,
"learning_rate": 3.5658536128085623e-06,
"logits/chosen": -1.3294860124588013,
"logits/rejected": -1.1633020639419556,
"logps/chosen": -514.1262817382812,
"logps/rejected": -586.5099487304688,
"loss": 0.5903,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.5135364532470703,
"rewards/margins": 1.0195398330688477,
"rewards/rejected": -3.533076047897339,
"step": 1620
},
{
"epoch": 0.4265898979324784,
"grad_norm": 9.197494506835938,
"learning_rate": 3.545145015558399e-06,
"logits/chosen": -1.1497808694839478,
"logits/rejected": -1.1394188404083252,
"logps/chosen": -455.3041076660156,
"logps/rejected": -552.9832763671875,
"loss": 0.5097,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.2378275394439697,
"rewards/margins": 1.1191097497940063,
"rewards/rejected": -3.3569374084472656,
"step": 1630
},
{
"epoch": 0.42920701387071447,
"grad_norm": 5.227996826171875,
"learning_rate": 3.5243491490002056e-06,
"logits/chosen": -1.344987154006958,
"logits/rejected": -1.2656126022338867,
"logps/chosen": -442.0404357910156,
"logps/rejected": -519.5951538085938,
"loss": 0.5867,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.8838779926300049,
"rewards/margins": 0.8285346031188965,
"rewards/rejected": -2.7124123573303223,
"step": 1640
},
{
"epoch": 0.4318241298089505,
"grad_norm": 6.703832149505615,
"learning_rate": 3.503467749582857e-06,
"logits/chosen": -1.4452259540557861,
"logits/rejected": -1.273822546005249,
"logps/chosen": -400.95703125,
"logps/rejected": -432.19775390625,
"loss": 0.5857,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.4979441165924072,
"rewards/margins": 0.6527599096298218,
"rewards/rejected": -2.1507039070129395,
"step": 1650
},
{
"epoch": 0.4344412457471866,
"grad_norm": 11.32523250579834,
"learning_rate": 3.4825025608971947e-06,
"logits/chosen": -1.386530876159668,
"logits/rejected": -1.324684500694275,
"logps/chosen": -355.1617736816406,
"logps/rejected": -427.02001953125,
"loss": 0.5318,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3329139947891235,
"rewards/margins": 0.6805238723754883,
"rewards/rejected": -2.0134379863739014,
"step": 1660
},
{
"epoch": 0.43705836168542267,
"grad_norm": 7.073009490966797,
"learning_rate": 3.4614553335304407e-06,
"logits/chosen": -1.3409563302993774,
"logits/rejected": -1.1487702131271362,
"logps/chosen": -427.2156677246094,
"logps/rejected": -493.6348571777344,
"loss": 0.4728,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.584120273590088,
"rewards/margins": 0.9814236760139465,
"rewards/rejected": -2.5655438899993896,
"step": 1670
},
{
"epoch": 0.4396754776236587,
"grad_norm": 11.820454597473145,
"learning_rate": 3.4403278249200222e-06,
"logits/chosen": -1.262634038925171,
"logits/rejected": -1.0713304281234741,
"logps/chosen": -481.8815002441406,
"logps/rejected": -564.7189331054688,
"loss": 0.4399,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.9938242435455322,
"rewards/margins": 1.2474219799041748,
"rewards/rejected": -3.241246461868286,
"step": 1680
},
{
"epoch": 0.44229259356189476,
"grad_norm": 19.43402671813965,
"learning_rate": 3.4191217992068293e-06,
"logits/chosen": -1.2867403030395508,
"logits/rejected": -1.1040995121002197,
"logps/chosen": -548.404296875,
"logps/rejected": -614.572998046875,
"loss": 0.5467,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.703883647918701,
"rewards/margins": 1.1177372932434082,
"rewards/rejected": -3.821620464324951,
"step": 1690
},
{
"epoch": 0.44490970950013087,
"grad_norm": 18.481212615966797,
"learning_rate": 3.3978390270879056e-06,
"logits/chosen": -1.169626235961914,
"logits/rejected": -1.0800492763519287,
"logps/chosen": -525.806884765625,
"logps/rejected": -649.9021606445312,
"loss": 0.5408,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.1594154834747314,
"rewards/margins": 1.1724364757537842,
"rewards/rejected": -4.331852436065674,
"step": 1700
},
{
"epoch": 0.44490970950013087,
"eval_logits/chosen": -1.1773556470870972,
"eval_logits/rejected": -1.0519527196884155,
"eval_logps/chosen": -568.2142333984375,
"eval_logps/rejected": -668.4868774414062,
"eval_loss": 0.5111355781555176,
"eval_rewards/accuracies": 0.7304999828338623,
"eval_rewards/chosen": -3.036123275756836,
"eval_rewards/margins": 1.203041672706604,
"eval_rewards/rejected": -4.23916482925415,
"eval_runtime": 1598.229,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 1700
},
{
"epoch": 0.4475268254383669,
"grad_norm": 17.149526596069336,
"learning_rate": 3.3764812856685995e-06,
"logits/chosen": -1.3090331554412842,
"logits/rejected": -1.292729139328003,
"logps/chosen": -502.16497802734375,
"logps/rejected": -622.8278198242188,
"loss": 0.5266,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.7771124839782715,
"rewards/margins": 1.0713131427764893,
"rewards/rejected": -3.8484256267547607,
"step": 1710
},
{
"epoch": 0.45014394137660296,
"grad_norm": 12.321800231933594,
"learning_rate": 3.3550503583141726e-06,
"logits/chosen": -1.5342642068862915,
"logits/rejected": -1.4036284685134888,
"logps/chosen": -507.0011291503906,
"logps/rejected": -602.5591430664062,
"loss": 0.497,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.3824760913848877,
"rewards/margins": 1.1078113317489624,
"rewards/rejected": -3.4902870655059814,
"step": 1720
},
{
"epoch": 0.45276105731483907,
"grad_norm": 7.4277191162109375,
"learning_rate": 3.3335480345008907e-06,
"logits/chosen": -1.5070991516113281,
"logits/rejected": -1.4292500019073486,
"logps/chosen": -427.82818603515625,
"logps/rejected": -509.09552001953125,
"loss": 0.4635,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6776756048202515,
"rewards/margins": 1.0597718954086304,
"rewards/rejected": -2.737447500228882,
"step": 1730
},
{
"epoch": 0.4553781732530751,
"grad_norm": 9.2946195602417,
"learning_rate": 3.3119761096666055e-06,
"logits/chosen": -1.5444905757904053,
"logits/rejected": -1.4140485525131226,
"logps/chosen": -468.56182861328125,
"logps/rejected": -515.2850341796875,
"loss": 0.5735,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.9366995096206665,
"rewards/margins": 0.7699880599975586,
"rewards/rejected": -2.7066876888275146,
"step": 1740
},
{
"epoch": 0.45799528919131116,
"grad_norm": 7.504373073577881,
"learning_rate": 3.290336385060832e-06,
"logits/chosen": -1.724323034286499,
"logits/rejected": -1.5582685470581055,
"logps/chosen": -436.670166015625,
"logps/rejected": -503.16705322265625,
"loss": 0.5256,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8786998987197876,
"rewards/margins": 0.8662854433059692,
"rewards/rejected": -2.7449851036071777,
"step": 1750
},
{
"epoch": 0.46061240512954726,
"grad_norm": 11.6707181930542,
"learning_rate": 3.268630667594348e-06,
"logits/chosen": -1.552310824394226,
"logits/rejected": -1.5216782093048096,
"logps/chosen": -447.37835693359375,
"logps/rejected": -520.7014770507812,
"loss": 0.507,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9179880619049072,
"rewards/margins": 0.9757116436958313,
"rewards/rejected": -2.8936996459960938,
"step": 1760
},
{
"epoch": 0.4632295210677833,
"grad_norm": 10.357820510864258,
"learning_rate": 3.2468607696883147e-06,
"logits/chosen": -1.6096198558807373,
"logits/rejected": -1.5589017868041992,
"logps/chosen": -452.2684631347656,
"logps/rejected": -553.5985717773438,
"loss": 0.4902,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.9917962551116943,
"rewards/margins": 0.9700411558151245,
"rewards/rejected": -2.9618372917175293,
"step": 1770
},
{
"epoch": 0.46584663700601936,
"grad_norm": 8.833939552307129,
"learning_rate": 3.225028509122944e-06,
"logits/chosen": -1.7435325384140015,
"logits/rejected": -1.6519057750701904,
"logps/chosen": -414.03179931640625,
"logps/rejected": -483.228515625,
"loss": 0.5437,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.7762641906738281,
"rewards/margins": 0.8044807314872742,
"rewards/rejected": -2.580744504928589,
"step": 1780
},
{
"epoch": 0.4684637529442554,
"grad_norm": 9.815378189086914,
"learning_rate": 3.2031357088857083e-06,
"logits/chosen": -1.6989177465438843,
"logits/rejected": -1.6307258605957031,
"logps/chosen": -467.5003967285156,
"logps/rejected": -558.3304443359375,
"loss": 0.5356,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.923940896987915,
"rewards/margins": 0.9426455497741699,
"rewards/rejected": -2.866586446762085,
"step": 1790
},
{
"epoch": 0.4710808688824915,
"grad_norm": 10.215496063232422,
"learning_rate": 3.181184197019127e-06,
"logits/chosen": -1.4613163471221924,
"logits/rejected": -1.3699003458023071,
"logps/chosen": -425.10430908203125,
"logps/rejected": -576.7496948242188,
"loss": 0.4705,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.9493353366851807,
"rewards/margins": 1.254817247390747,
"rewards/rejected": -3.2041525840759277,
"step": 1800
},
{
"epoch": 0.4710808688824915,
"eval_logits/chosen": -1.4508103132247925,
"eval_logits/rejected": -1.3478518724441528,
"eval_logps/chosen": -476.96630859375,
"eval_logps/rejected": -563.5121459960938,
"eval_loss": 0.49488988518714905,
"eval_rewards/accuracies": 0.7434999942779541,
"eval_rewards/chosen": -2.123643636703491,
"eval_rewards/margins": 1.0657742023468018,
"eval_rewards/rejected": -3.189417839050293,
"eval_runtime": 1598.3976,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 1800
},
{
"epoch": 0.47369798482072756,
"grad_norm": 19.882471084594727,
"learning_rate": 3.159175806468126e-06,
"logits/chosen": -1.4092731475830078,
"logits/rejected": -1.2594877481460571,
"logps/chosen": -483.1397399902344,
"logps/rejected": -564.5753173828125,
"loss": 0.4771,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.345776081085205,
"rewards/margins": 1.1159356832504272,
"rewards/rejected": -3.4617114067077637,
"step": 1810
},
{
"epoch": 0.4763151007589636,
"grad_norm": 13.595392227172852,
"learning_rate": 3.1371123749269804e-06,
"logits/chosen": -1.4161027669906616,
"logits/rejected": -1.3486690521240234,
"logps/chosen": -579.3997192382812,
"logps/rejected": -658.3812255859375,
"loss": 0.588,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.961474895477295,
"rewards/margins": 0.9501525163650513,
"rewards/rejected": -3.9116275310516357,
"step": 1820
},
{
"epoch": 0.4789322166971997,
"grad_norm": 16.07357406616211,
"learning_rate": 3.114995744685877e-06,
"logits/chosen": -1.3573070764541626,
"logits/rejected": -1.3251278400421143,
"logps/chosen": -537.996826171875,
"logps/rejected": -629.4193115234375,
"loss": 0.5297,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.90761399269104,
"rewards/margins": 1.0439841747283936,
"rewards/rejected": -3.9515984058380127,
"step": 1830
},
{
"epoch": 0.48154933263543576,
"grad_norm": 6.550096035003662,
"learning_rate": 3.0928277624770743e-06,
"logits/chosen": -1.5519769191741943,
"logits/rejected": -1.4119082689285278,
"logps/chosen": -502.74102783203125,
"logps/rejected": -586.8773193359375,
"loss": 0.5118,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.1812124252319336,
"rewards/margins": 1.1252543926239014,
"rewards/rejected": -3.306466579437256,
"step": 1840
},
{
"epoch": 0.4841664485736718,
"grad_norm": 7.670591354370117,
"learning_rate": 3.070610279320708e-06,
"logits/chosen": -1.464906930923462,
"logits/rejected": -1.32064950466156,
"logps/chosen": -504.308349609375,
"logps/rejected": -599.1265869140625,
"loss": 0.4443,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.1736900806427,
"rewards/margins": 1.1892703771591187,
"rewards/rejected": -3.3629603385925293,
"step": 1850
},
{
"epoch": 0.48678356451190785,
"grad_norm": 9.109015464782715,
"learning_rate": 3.0483451503702264e-06,
"logits/chosen": -1.249629020690918,
"logits/rejected": -1.168210744857788,
"logps/chosen": -578.9190063476562,
"logps/rejected": -669.9890747070312,
"loss": 0.5669,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.928832530975342,
"rewards/margins": 1.1118719577789307,
"rewards/rejected": -4.040704250335693,
"step": 1860
},
{
"epoch": 0.48940068045014395,
"grad_norm": 10.058290481567383,
"learning_rate": 3.0260342347574916e-06,
"logits/chosen": -1.2091599702835083,
"logits/rejected": -1.0555975437164307,
"logps/chosen": -534.0882568359375,
"logps/rejected": -660.2562255859375,
"loss": 0.4437,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.618621826171875,
"rewards/margins": 1.4526309967041016,
"rewards/rejected": -4.071252822875977,
"step": 1870
},
{
"epoch": 0.49201779638838,
"grad_norm": 10.198968887329102,
"learning_rate": 3.0036793954375358e-06,
"logits/chosen": -1.303740382194519,
"logits/rejected": -1.155242681503296,
"logps/chosen": -507.42529296875,
"logps/rejected": -583.2550048828125,
"loss": 0.4895,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.374905824661255,
"rewards/margins": 1.1803052425384521,
"rewards/rejected": -3.555210828781128,
"step": 1880
},
{
"epoch": 0.49463491232661605,
"grad_norm": 10.858015060424805,
"learning_rate": 2.981282499033009e-06,
"logits/chosen": -1.3813010454177856,
"logits/rejected": -1.2690826654434204,
"logps/chosen": -486.0970153808594,
"logps/rejected": -569.85888671875,
"loss": 0.5031,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.0988192558288574,
"rewards/margins": 1.0918813943862915,
"rewards/rejected": -3.1907010078430176,
"step": 1890
},
{
"epoch": 0.49725202826485215,
"grad_norm": 10.221612930297852,
"learning_rate": 2.9588454156783163e-06,
"logits/chosen": -1.4239078760147095,
"logits/rejected": -1.2654526233673096,
"logps/chosen": -474.64825439453125,
"logps/rejected": -587.3582763671875,
"loss": 0.4447,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.9338430166244507,
"rewards/margins": 1.3446595668792725,
"rewards/rejected": -3.2785022258758545,
"step": 1900
},
{
"epoch": 0.49725202826485215,
"eval_logits/chosen": -1.2950754165649414,
"eval_logits/rejected": -1.1710810661315918,
"eval_logps/chosen": -468.10113525390625,
"eval_logps/rejected": -559.6229248046875,
"eval_loss": 0.49835336208343506,
"eval_rewards/accuracies": 0.7419999837875366,
"eval_rewards/chosen": -2.0349912643432617,
"eval_rewards/margins": 1.1155344247817993,
"eval_rewards/rejected": -3.1505255699157715,
"eval_runtime": 1598.3655,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 1900
},
{
"epoch": 0.4998691442030882,
"grad_norm": 12.752076148986816,
"learning_rate": 2.9363700188634597e-06,
"logits/chosen": -1.3872668743133545,
"logits/rejected": -1.265047311782837,
"logps/chosen": -489.17291259765625,
"logps/rejected": -553.0140991210938,
"loss": 0.5231,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.242345094680786,
"rewards/margins": 1.008666753768921,
"rewards/rejected": -3.251011610031128,
"step": 1910
},
{
"epoch": 0.5024862601413242,
"grad_norm": 10.04758071899414,
"learning_rate": 2.9138581852776053e-06,
"logits/chosen": -1.368890404701233,
"logits/rejected": -1.2567319869995117,
"logps/chosen": -493.54638671875,
"logps/rejected": -586.2965087890625,
"loss": 0.5251,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.339406728744507,
"rewards/margins": 1.0563156604766846,
"rewards/rejected": -3.3957226276397705,
"step": 1920
},
{
"epoch": 0.5051033760795604,
"grad_norm": 9.104912757873535,
"learning_rate": 2.8913117946523805e-06,
"logits/chosen": -1.4171912670135498,
"logits/rejected": -1.2455161809921265,
"logps/chosen": -490.3606872558594,
"logps/rejected": -558.6239013671875,
"loss": 0.4792,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.2399258613586426,
"rewards/margins": 1.0637315511703491,
"rewards/rejected": -3.3036580085754395,
"step": 1930
},
{
"epoch": 0.5077204920177963,
"grad_norm": 11.299101829528809,
"learning_rate": 2.8687327296049126e-06,
"logits/chosen": -1.4033154249191284,
"logits/rejected": -1.307733178138733,
"logps/chosen": -491.1643981933594,
"logps/rejected": -589.1052856445312,
"loss": 0.5129,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.32118558883667,
"rewards/margins": 1.0772285461425781,
"rewards/rejected": -3.398413896560669,
"step": 1940
},
{
"epoch": 0.5103376079560324,
"grad_norm": 18.843233108520508,
"learning_rate": 2.8461228754806376e-06,
"logits/chosen": -1.4243779182434082,
"logits/rejected": -1.2675760984420776,
"logps/chosen": -507.2261657714844,
"logps/rejected": -582.9666748046875,
"loss": 0.5307,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.3146495819091797,
"rewards/margins": 1.0160276889801025,
"rewards/rejected": -3.330677032470703,
"step": 1950
},
{
"epoch": 0.5129547238942685,
"grad_norm": 9.132381439208984,
"learning_rate": 2.823484120195865e-06,
"logits/chosen": -1.4892734289169312,
"logits/rejected": -1.310719609260559,
"logps/chosen": -494.809814453125,
"logps/rejected": -583.18798828125,
"loss": 0.4256,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.1708030700683594,
"rewards/margins": 1.2277300357818604,
"rewards/rejected": -3.398533582687378,
"step": 1960
},
{
"epoch": 0.5155718398325045,
"grad_norm": 13.682202339172363,
"learning_rate": 2.8008183540801486e-06,
"logits/chosen": -1.2465951442718506,
"logits/rejected": -1.113993525505066,
"logps/chosen": -583.8721313476562,
"logps/rejected": -649.005126953125,
"loss": 0.5177,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.103384494781494,
"rewards/margins": 1.1054545640945435,
"rewards/rejected": -4.208839416503906,
"step": 1970
},
{
"epoch": 0.5181889557707406,
"grad_norm": 13.151285171508789,
"learning_rate": 2.7781274697184353e-06,
"logits/chosen": -1.1965068578720093,
"logits/rejected": -1.2104531526565552,
"logps/chosen": -526.9979248046875,
"logps/rejected": -667.6734619140625,
"loss": 0.5585,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.9773271083831787,
"rewards/margins": 1.2037287950515747,
"rewards/rejected": -4.181056022644043,
"step": 1980
},
{
"epoch": 0.5208060717089767,
"grad_norm": 7.820137023925781,
"learning_rate": 2.7554133617930397e-06,
"logits/chosen": -1.4265800714492798,
"logits/rejected": -1.3070918321609497,
"logps/chosen": -457.6055603027344,
"logps/rejected": -541.1328735351562,
"loss": 0.5257,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.098328113555908,
"rewards/margins": 0.9863009452819824,
"rewards/rejected": -3.0846290588378906,
"step": 1990
},
{
"epoch": 0.5234231876472127,
"grad_norm": 12.656340599060059,
"learning_rate": 2.7326779269254363e-06,
"logits/chosen": -1.6775627136230469,
"logits/rejected": -1.5273171663284302,
"logps/chosen": -465.67138671875,
"logps/rejected": -513.0640869140625,
"loss": 0.4561,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7640680074691772,
"rewards/margins": 1.0338157415390015,
"rewards/rejected": -2.797883987426758,
"step": 2000
},
{
"epoch": 0.5234231876472127,
"eval_logits/chosen": -1.4695755243301392,
"eval_logits/rejected": -1.3557151556015015,
"eval_logps/chosen": -461.283935546875,
"eval_logps/rejected": -540.4462280273438,
"eval_loss": 0.4929336607456207,
"eval_rewards/accuracies": 0.7419999837875366,
"eval_rewards/chosen": -1.9668195247650146,
"eval_rewards/margins": 0.9919391870498657,
"eval_rewards/rejected": -2.95875883102417,
"eval_runtime": 1598.5788,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 2000
},
{
"epoch": 0.5260403035854488,
"grad_norm": 10.136625289916992,
"learning_rate": 2.7099230635178954e-06,
"logits/chosen": -1.506194829940796,
"logits/rejected": -1.4691191911697388,
"logps/chosen": -475.35992431640625,
"logps/rejected": -562.2814331054688,
"loss": 0.5025,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.106142520904541,
"rewards/margins": 0.9343917965888977,
"rewards/rejected": -3.040534257888794,
"step": 2010
},
{
"epoch": 0.528657419523685,
"grad_norm": 10.457864761352539,
"learning_rate": 2.6871506715949608e-06,
"logits/chosen": -1.5337005853652954,
"logits/rejected": -1.3907279968261719,
"logps/chosen": -485.59637451171875,
"logps/rejected": -580.8443603515625,
"loss": 0.4664,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.3269832134246826,
"rewards/margins": 1.1330835819244385,
"rewards/rejected": -3.4600670337677,
"step": 2020
},
{
"epoch": 0.5312745354619209,
"grad_norm": 7.218365669250488,
"learning_rate": 2.6643626526448063e-06,
"logits/chosen": -1.4807672500610352,
"logits/rejected": -1.308292031288147,
"logps/chosen": -573.4922485351562,
"logps/rejected": -676.8171997070312,
"loss": 0.4288,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.787353038787842,
"rewards/margins": 1.456699252128601,
"rewards/rejected": -4.244051933288574,
"step": 2030
},
{
"epoch": 0.533891651400157,
"grad_norm": 14.370759010314941,
"learning_rate": 2.6415609094604562e-06,
"logits/chosen": -1.194697380065918,
"logits/rejected": -1.1315624713897705,
"logps/chosen": -666.68896484375,
"logps/rejected": -775.71240234375,
"loss": 0.5124,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.9625601768493652,
"rewards/margins": 1.2808544635772705,
"rewards/rejected": -5.243414878845215,
"step": 2040
},
{
"epoch": 0.5365087673383931,
"grad_norm": 12.651047706604004,
"learning_rate": 2.618747345980904e-06,
"logits/chosen": -1.1674007177352905,
"logits/rejected": -0.9596608877182007,
"logps/chosen": -729.123291015625,
"logps/rejected": -793.8466796875,
"loss": 0.547,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -4.807937145233154,
"rewards/margins": 1.1525070667266846,
"rewards/rejected": -5.960444450378418,
"step": 2050
},
{
"epoch": 0.5391258832766291,
"grad_norm": 6.089327335357666,
"learning_rate": 2.595923867132136e-06,
"logits/chosen": -1.2956593036651611,
"logits/rejected": -1.1621012687683105,
"logps/chosen": -649.8438720703125,
"logps/rejected": -758.2276000976562,
"loss": 0.4846,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.689380645751953,
"rewards/margins": 1.3314837217330933,
"rewards/rejected": -5.020864009857178,
"step": 2060
},
{
"epoch": 0.5417429992148652,
"grad_norm": 7.40191650390625,
"learning_rate": 2.5730923786680672e-06,
"logits/chosen": -1.2692714929580688,
"logits/rejected": -1.248396635055542,
"logps/chosen": -549.1704711914062,
"logps/rejected": -643.1365966796875,
"loss": 0.5449,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.976423740386963,
"rewards/margins": 0.864033579826355,
"rewards/rejected": -3.8404572010040283,
"step": 2070
},
{
"epoch": 0.5443601151531012,
"grad_norm": 9.540526390075684,
"learning_rate": 2.5502547870114137e-06,
"logits/chosen": -1.3864257335662842,
"logits/rejected": -1.2374814748764038,
"logps/chosen": -515.8956909179688,
"logps/rejected": -582.565185546875,
"loss": 0.5054,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.6055984497070312,
"rewards/margins": 0.9696043729782104,
"rewards/rejected": -3.5752029418945312,
"step": 2080
},
{
"epoch": 0.5469772310913373,
"grad_norm": 11.426345825195312,
"learning_rate": 2.527412999094507e-06,
"logits/chosen": -1.3803369998931885,
"logits/rejected": -1.2163236141204834,
"logps/chosen": -550.672607421875,
"logps/rejected": -663.0771484375,
"loss": 0.4469,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.5704407691955566,
"rewards/margins": 1.2492586374282837,
"rewards/rejected": -3.81969952583313,
"step": 2090
},
{
"epoch": 0.5495943470295734,
"grad_norm": 10.400588035583496,
"learning_rate": 2.504568922200064e-06,
"logits/chosen": -1.3159221410751343,
"logits/rejected": -1.1753368377685547,
"logps/chosen": -523.4752197265625,
"logps/rejected": -638.3160400390625,
"loss": 0.5068,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.8226757049560547,
"rewards/margins": 1.3093383312225342,
"rewards/rejected": -4.132014274597168,
"step": 2100
},
{
"epoch": 0.5495943470295734,
"eval_logits/chosen": -1.2426347732543945,
"eval_logits/rejected": -1.1150033473968506,
"eval_logps/chosen": -579.1231079101562,
"eval_logps/rejected": -680.8953857421875,
"eval_loss": 0.49687275290489197,
"eval_rewards/accuracies": 0.7350000143051147,
"eval_rewards/chosen": -3.145211935043335,
"eval_rewards/margins": 1.2180382013320923,
"eval_rewards/rejected": -4.363250255584717,
"eval_runtime": 1598.2336,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 2100
},
{
"epoch": 0.5522114629678094,
"grad_norm": 13.339751243591309,
"learning_rate": 2.4817244638019333e-06,
"logits/chosen": -1.3830143213272095,
"logits/rejected": -1.2277696132659912,
"logps/chosen": -579.9473876953125,
"logps/rejected": -659.97412109375,
"loss": 0.5192,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.041943073272705,
"rewards/margins": 1.2124286890029907,
"rewards/rejected": -4.254371643066406,
"step": 2110
},
{
"epoch": 0.5548285789060455,
"grad_norm": 10.70681095123291,
"learning_rate": 2.4588815314058155e-06,
"logits/chosen": -1.3805171251296997,
"logits/rejected": -1.3095520734786987,
"logps/chosen": -527.185302734375,
"logps/rejected": -602.2315673828125,
"loss": 0.4666,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.8171143531799316,
"rewards/margins": 1.0927484035491943,
"rewards/rejected": -3.909862518310547,
"step": 2120
},
{
"epoch": 0.5574456948442816,
"grad_norm": 13.302124977111816,
"learning_rate": 2.4360420323899922e-06,
"logits/chosen": -1.4553115367889404,
"logits/rejected": -1.3466088771820068,
"logps/chosen": -556.26611328125,
"logps/rejected": -608.5457763671875,
"loss": 0.5919,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.8050436973571777,
"rewards/margins": 0.8486245274543762,
"rewards/rejected": -3.653668165206909,
"step": 2130
},
{
"epoch": 0.5600628107825176,
"grad_norm": 9.734771728515625,
"learning_rate": 2.4132078738460585e-06,
"logits/chosen": -1.5308691263198853,
"logits/rejected": -1.3970110416412354,
"logps/chosen": -516.7899169921875,
"logps/rejected": -586.8917236328125,
"loss": 0.4794,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.5241827964782715,
"rewards/margins": 1.0863929986953735,
"rewards/rejected": -3.6105754375457764,
"step": 2140
},
{
"epoch": 0.5626799267207537,
"grad_norm": 16.153608322143555,
"learning_rate": 2.3903809624196826e-06,
"logits/chosen": -1.4662330150604248,
"logits/rejected": -1.3525559902191162,
"logps/chosen": -481.68035888671875,
"logps/rejected": -539.03515625,
"loss": 0.5497,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.4799540042877197,
"rewards/margins": 0.8992452621459961,
"rewards/rejected": -3.379199266433716,
"step": 2150
},
{
"epoch": 0.5652970426589898,
"grad_norm": 13.256193161010742,
"learning_rate": 2.3675632041513978e-06,
"logits/chosen": -1.5737719535827637,
"logits/rejected": -1.3470098972320557,
"logps/chosen": -527.2965698242188,
"logps/rejected": -575.4271240234375,
"loss": 0.4882,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.403984546661377,
"rewards/margins": 1.1281957626342773,
"rewards/rejected": -3.532180070877075,
"step": 2160
},
{
"epoch": 0.5679141585972258,
"grad_norm": 13.135820388793945,
"learning_rate": 2.3447565043174533e-06,
"logits/chosen": -1.4236756563186646,
"logits/rejected": -1.263068437576294,
"logps/chosen": -507.3004455566406,
"logps/rejected": -562.6768798828125,
"loss": 0.5195,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.490262746810913,
"rewards/margins": 0.9420592188835144,
"rewards/rejected": -3.4323222637176514,
"step": 2170
},
{
"epoch": 0.5705312745354619,
"grad_norm": 11.776784896850586,
"learning_rate": 2.321962767270724e-06,
"logits/chosen": -1.4430488348007202,
"logits/rejected": -1.3176742792129517,
"logps/chosen": -504.53009033203125,
"logps/rejected": -551.927490234375,
"loss": 0.5579,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.546835422515869,
"rewards/margins": 0.8456700444221497,
"rewards/rejected": -3.392504930496216,
"step": 2180
},
{
"epoch": 0.573148390473698,
"grad_norm": 8.283063888549805,
"learning_rate": 2.299183896281692e-06,
"logits/chosen": -1.4085118770599365,
"logits/rejected": -1.3014566898345947,
"logps/chosen": -492.5560607910156,
"logps/rejected": -586.7115478515625,
"loss": 0.5087,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.376844882965088,
"rewards/margins": 0.9645326733589172,
"rewards/rejected": -3.3413777351379395,
"step": 2190
},
{
"epoch": 0.575765506411934,
"grad_norm": 6.720417022705078,
"learning_rate": 2.2764217933795297e-06,
"logits/chosen": -1.529714822769165,
"logits/rejected": -1.415728211402893,
"logps/chosen": -480.97955322265625,
"logps/rejected": -568.6060791015625,
"loss": 0.4839,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.196598529815674,
"rewards/margins": 1.081301212310791,
"rewards/rejected": -3.2778995037078857,
"step": 2200
},
{
"epoch": 0.575765506411934,
"eval_logits/chosen": -1.3886340856552124,
"eval_logits/rejected": -1.2705532312393188,
"eval_logps/chosen": -502.5681457519531,
"eval_logps/rejected": -588.3314819335938,
"eval_loss": 0.49266016483306885,
"eval_rewards/accuracies": 0.7404999732971191,
"eval_rewards/chosen": -2.379661798477173,
"eval_rewards/margins": 1.0579497814178467,
"eval_rewards/rejected": -3.4376115798950195,
"eval_runtime": 1597.9898,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.156,
"step": 2200
},
{
"epoch": 0.5783826223501701,
"grad_norm": 5.171483039855957,
"learning_rate": 2.2536783591932786e-06,
"logits/chosen": -1.5503586530685425,
"logits/rejected": -1.394595742225647,
"logps/chosen": -533.165771484375,
"logps/rejected": -615.0496826171875,
"loss": 0.4996,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.578037738800049,
"rewards/margins": 1.0000585317611694,
"rewards/rejected": -3.5780959129333496,
"step": 2210
},
{
"epoch": 0.5809997382884062,
"grad_norm": 8.2252836227417,
"learning_rate": 2.230955492793149e-06,
"logits/chosen": -1.3457584381103516,
"logits/rejected": -1.2957208156585693,
"logps/chosen": -549.8411865234375,
"logps/rejected": -641.9370727539062,
"loss": 0.5669,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.760831356048584,
"rewards/margins": 1.0501973628997803,
"rewards/rejected": -3.811028242111206,
"step": 2220
},
{
"epoch": 0.5836168542266422,
"grad_norm": 8.77107048034668,
"learning_rate": 2.208255091531947e-06,
"logits/chosen": -1.3714348077774048,
"logits/rejected": -1.294524908065796,
"logps/chosen": -516.0252685546875,
"logps/rejected": -604.4722900390625,
"loss": 0.4741,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.39119291305542,
"rewards/margins": 1.2340974807739258,
"rewards/rejected": -3.6252903938293457,
"step": 2230
},
{
"epoch": 0.5862339701648783,
"grad_norm": 13.218793869018555,
"learning_rate": 2.1855790508866435e-06,
"logits/chosen": -1.4534399509429932,
"logits/rejected": -1.355835199356079,
"logps/chosen": -526.3766479492188,
"logps/rejected": -626.1098022460938,
"loss": 0.5044,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.2947020530700684,
"rewards/margins": 1.1731860637664795,
"rewards/rejected": -3.467888355255127,
"step": 2240
},
{
"epoch": 0.5888510861031143,
"grad_norm": 7.438356876373291,
"learning_rate": 2.162929264300107e-06,
"logits/chosen": -1.4775898456573486,
"logits/rejected": -1.375610113143921,
"logps/chosen": -440.20208740234375,
"logps/rejected": -556.2523193359375,
"loss": 0.3962,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.712977647781372,
"rewards/margins": 1.3840254545211792,
"rewards/rejected": -3.0970029830932617,
"step": 2250
},
{
"epoch": 0.5914682020413504,
"grad_norm": 13.876564025878906,
"learning_rate": 2.1403076230230006e-06,
"logits/chosen": -1.4080345630645752,
"logits/rejected": -1.2920736074447632,
"logps/chosen": -475.05548095703125,
"logps/rejected": -532.099365234375,
"loss": 0.5823,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.039393901824951,
"rewards/margins": 0.848638653755188,
"rewards/rejected": -2.8880326747894287,
"step": 2260
},
{
"epoch": 0.5940853179795865,
"grad_norm": 10.589466094970703,
"learning_rate": 2.11771601595586e-06,
"logits/chosen": -1.447265386581421,
"logits/rejected": -1.3221790790557861,
"logps/chosen": -478.97149658203125,
"logps/rejected": -525.7181396484375,
"loss": 0.5126,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.9702937602996826,
"rewards/margins": 0.9898223876953125,
"rewards/rejected": -2.960115909576416,
"step": 2270
},
{
"epoch": 0.5967024339178225,
"grad_norm": 13.875945091247559,
"learning_rate": 2.0951563294913737e-06,
"logits/chosen": -1.3917208909988403,
"logits/rejected": -1.2089694738388062,
"logps/chosen": -480.66595458984375,
"logps/rejected": -556.7701416015625,
"loss": 0.462,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.19204044342041,
"rewards/margins": 1.0662552118301392,
"rewards/rejected": -3.2582955360412598,
"step": 2280
},
{
"epoch": 0.5993195498560586,
"grad_norm": 9.446721076965332,
"learning_rate": 2.0726304473568693e-06,
"logits/chosen": -1.3553434610366821,
"logits/rejected": -1.238797903060913,
"logps/chosen": -493.4701232910156,
"logps/rejected": -565.8324584960938,
"loss": 0.4824,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.3467798233032227,
"rewards/margins": 1.046331524848938,
"rewards/rejected": -3.393110990524292,
"step": 2290
},
{
"epoch": 0.6019366657942947,
"grad_norm": 9.440372467041016,
"learning_rate": 2.050140250457023e-06,
"logits/chosen": -1.4484026432037354,
"logits/rejected": -1.2413018941879272,
"logps/chosen": -548.2637939453125,
"logps/rejected": -634.7371826171875,
"loss": 0.4729,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.7083754539489746,
"rewards/margins": 1.2052128314971924,
"rewards/rejected": -3.913588285446167,
"step": 2300
},
{
"epoch": 0.6019366657942947,
"eval_logits/chosen": -1.2145209312438965,
"eval_logits/rejected": -1.0868196487426758,
"eval_logps/chosen": -549.21240234375,
"eval_logps/rejected": -656.666748046875,
"eval_loss": 0.49239280819892883,
"eval_rewards/accuracies": 0.7404999732971191,
"eval_rewards/chosen": -2.846104383468628,
"eval_rewards/margins": 1.2748597860336304,
"eval_rewards/rejected": -4.120964050292969,
"eval_runtime": 1598.0392,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.156,
"step": 2300
},
{
"epoch": 0.6045537817325307,
"grad_norm": 15.882525444030762,
"learning_rate": 2.0276876167168042e-06,
"logits/chosen": -1.1735626459121704,
"logits/rejected": -1.07330322265625,
"logps/chosen": -511.028564453125,
"logps/rejected": -598.4305419921875,
"loss": 0.5852,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.8660171031951904,
"rewards/margins": 1.2002947330474854,
"rewards/rejected": -4.066311836242676,
"step": 2310
},
{
"epoch": 0.6071708976707668,
"grad_norm": 8.326827049255371,
"learning_rate": 2.0052744209246682e-06,
"logits/chosen": -1.3162415027618408,
"logits/rejected": -1.1979725360870361,
"logps/chosen": -522.6014404296875,
"logps/rejected": -590.3125610351562,
"loss": 0.5189,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.673841714859009,
"rewards/margins": 1.005782127380371,
"rewards/rejected": -3.6796233654022217,
"step": 2320
},
{
"epoch": 0.6097880136090029,
"grad_norm": 14.694316864013672,
"learning_rate": 1.9829025345760127e-06,
"logits/chosen": -1.321624994277954,
"logits/rejected": -1.2669956684112549,
"logps/chosen": -527.9610595703125,
"logps/rejected": -604.3615112304688,
"loss": 0.5587,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.490408420562744,
"rewards/margins": 0.8247787356376648,
"rewards/rejected": -3.315187454223633,
"step": 2330
},
{
"epoch": 0.6124051295472389,
"grad_norm": 10.311464309692383,
"learning_rate": 1.9605738257169115e-06,
"logits/chosen": -1.270620346069336,
"logits/rejected": -1.120276927947998,
"logps/chosen": -485.13226318359375,
"logps/rejected": -588.9454956054688,
"loss": 0.4972,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.5224430561065674,
"rewards/margins": 1.1178371906280518,
"rewards/rejected": -3.6402804851531982,
"step": 2340
},
{
"epoch": 0.615022245485475,
"grad_norm": 10.49399471282959,
"learning_rate": 1.9382901587881275e-06,
"logits/chosen": -1.304638147354126,
"logits/rejected": -1.1775546073913574,
"logps/chosen": -500.91339111328125,
"logps/rejected": -595.765625,
"loss": 0.4158,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.42271089553833,
"rewards/margins": 1.271071195602417,
"rewards/rejected": -3.693782329559326,
"step": 2350
},
{
"epoch": 0.6176393614237111,
"grad_norm": 10.7622652053833,
"learning_rate": 1.916053394469437e-06,
"logits/chosen": -1.3254437446594238,
"logits/rejected": -1.1220028400421143,
"logps/chosen": -544.5164184570312,
"logps/rejected": -650.4244384765625,
"loss": 0.5071,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.812866687774658,
"rewards/margins": 1.2050769329071045,
"rewards/rejected": -4.017943382263184,
"step": 2360
},
{
"epoch": 0.6202564773619471,
"grad_norm": 9.485381126403809,
"learning_rate": 1.8938653895242604e-06,
"logits/chosen": -1.2691552639007568,
"logits/rejected": -1.0826399326324463,
"logps/chosen": -547.7079467773438,
"logps/rejected": -651.6534423828125,
"loss": 0.4229,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.8349509239196777,
"rewards/margins": 1.3276488780975342,
"rewards/rejected": -4.162599563598633,
"step": 2370
},
{
"epoch": 0.6228735933001832,
"grad_norm": 13.950716018676758,
"learning_rate": 1.8717279966446267e-06,
"logits/chosen": -1.116172432899475,
"logits/rejected": -1.0301882028579712,
"logps/chosen": -559.43896484375,
"logps/rejected": -674.9713134765625,
"loss": 0.4381,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.109766721725464,
"rewards/margins": 1.2503349781036377,
"rewards/rejected": -4.360101699829102,
"step": 2380
},
{
"epoch": 0.6254907092384192,
"grad_norm": 6.874395370483398,
"learning_rate": 1.8496430642964698e-06,
"logits/chosen": -1.1902521848678589,
"logits/rejected": -1.0676952600479126,
"logps/chosen": -581.3973999023438,
"logps/rejected": -685.8600463867188,
"loss": 0.4878,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.091068983078003,
"rewards/margins": 1.280792474746704,
"rewards/rejected": -4.371861457824707,
"step": 2390
},
{
"epoch": 0.6281078251766553,
"grad_norm": 8.19883918762207,
"learning_rate": 1.827612436565286e-06,
"logits/chosen": -1.2159771919250488,
"logits/rejected": -1.0594358444213867,
"logps/chosen": -552.8912353515625,
"logps/rejected": -668.69921875,
"loss": 0.4501,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.900498390197754,
"rewards/margins": 1.363520622253418,
"rewards/rejected": -4.264018535614014,
"step": 2400
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -1.125669240951538,
"eval_logits/rejected": -0.9978408813476562,
"eval_logps/chosen": -562.0332641601562,
"eval_logps/rejected": -668.2345581054688,
"eval_loss": 0.489955335855484,
"eval_rewards/accuracies": 0.7429999709129333,
"eval_rewards/chosen": -2.974313735961914,
"eval_rewards/margins": 1.2623279094696045,
"eval_rewards/rejected": -4.236640930175781,
"eval_runtime": 1597.7712,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.156,
"step": 2400
},
{
"epoch": 0.6307249411148914,
"grad_norm": 17.752099990844727,
"learning_rate": 1.8056379530021492e-06,
"logits/chosen": -1.2881437540054321,
"logits/rejected": -1.1982684135437012,
"logps/chosen": -541.344482421875,
"logps/rejected": -619.2030029296875,
"loss": 0.5127,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.9722681045532227,
"rewards/margins": 1.0265535116195679,
"rewards/rejected": -3.998821258544922,
"step": 2410
},
{
"epoch": 0.6333420570531274,
"grad_norm": 8.487099647521973,
"learning_rate": 1.7837214484701154e-06,
"logits/chosen": -1.339787244796753,
"logits/rejected": -1.2054253816604614,
"logps/chosen": -510.6474609375,
"logps/rejected": -616.0956420898438,
"loss": 0.4786,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.5842533111572266,
"rewards/margins": 1.2878291606903076,
"rewards/rejected": -3.872082233428955,
"step": 2420
},
{
"epoch": 0.6359591729913635,
"grad_norm": 14.060596466064453,
"learning_rate": 1.7618647529910043e-06,
"logits/chosen": -1.3402836322784424,
"logits/rejected": -1.2104113101959229,
"logps/chosen": -508.7401428222656,
"logps/rejected": -625.2578735351562,
"loss": 0.4786,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.520415782928467,
"rewards/margins": 1.2837390899658203,
"rewards/rejected": -3.804154872894287,
"step": 2430
},
{
"epoch": 0.6385762889295996,
"grad_norm": 9.59952449798584,
"learning_rate": 1.7400696915925996e-06,
"logits/chosen": -1.3208459615707397,
"logits/rejected": -1.141103982925415,
"logps/chosen": -542.9054565429688,
"logps/rejected": -597.6497802734375,
"loss": 0.5332,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.7234339714050293,
"rewards/margins": 1.1302305459976196,
"rewards/rejected": -3.853663921356201,
"step": 2440
},
{
"epoch": 0.6411934048678356,
"grad_norm": 12.0162992477417,
"learning_rate": 1.718338084156254e-06,
"logits/chosen": -1.309754490852356,
"logits/rejected": -1.1585099697113037,
"logps/chosen": -527.2728881835938,
"logps/rejected": -612.1050415039062,
"loss": 0.4561,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.4056506156921387,
"rewards/margins": 1.2328300476074219,
"rewards/rejected": -3.6384806632995605,
"step": 2450
},
{
"epoch": 0.6438105208060717,
"grad_norm": 15.278785705566406,
"learning_rate": 1.6966717452649372e-06,
"logits/chosen": -1.446975588798523,
"logits/rejected": -1.282833456993103,
"logps/chosen": -502.6387634277344,
"logps/rejected": -571.8133544921875,
"loss": 0.4497,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2578883171081543,
"rewards/margins": 1.2107127904891968,
"rewards/rejected": -3.4686012268066406,
"step": 2460
},
{
"epoch": 0.6464276367443078,
"grad_norm": 8.051189422607422,
"learning_rate": 1.6750724840517103e-06,
"logits/chosen": -1.3973640203475952,
"logits/rejected": -1.3273109197616577,
"logps/chosen": -461.38665771484375,
"logps/rejected": -567.72509765625,
"loss": 0.5096,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.086717128753662,
"rewards/margins": 1.0224206447601318,
"rewards/rejected": -3.109138011932373,
"step": 2470
},
{
"epoch": 0.6490447526825438,
"grad_norm": 11.723711967468262,
"learning_rate": 1.6535421040486686e-06,
"logits/chosen": -1.2049771547317505,
"logits/rejected": -1.1100887060165405,
"logps/chosen": -472.3949279785156,
"logps/rejected": -566.0884399414062,
"loss": 0.4321,
"rewards/accuracies": 0.84375,
"rewards/chosen": -2.191418409347534,
"rewards/margins": 1.2799413204193115,
"rewards/rejected": -3.4713597297668457,
"step": 2480
},
{
"epoch": 0.6516618686207799,
"grad_norm": 18.185462951660156,
"learning_rate": 1.6320824030363458e-06,
"logits/chosen": -1.2619669437408447,
"logits/rejected": -1.2079191207885742,
"logps/chosen": -466.91845703125,
"logps/rejected": -579.8604125976562,
"loss": 0.4609,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.3208956718444824,
"rewards/margins": 1.2844674587249756,
"rewards/rejected": -3.605363130569458,
"step": 2490
},
{
"epoch": 0.654278984559016,
"grad_norm": 14.226390838623047,
"learning_rate": 1.6106951728936028e-06,
"logits/chosen": -1.3449714183807373,
"logits/rejected": -1.212172508239746,
"logps/chosen": -500.9017028808594,
"logps/rejected": -603.9783325195312,
"loss": 0.4982,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.4372124671936035,
"rewards/margins": 1.1106388568878174,
"rewards/rejected": -3.547851085662842,
"step": 2500
},
{
"epoch": 0.654278984559016,
"eval_logits/chosen": -1.1862049102783203,
"eval_logits/rejected": -1.0531891584396362,
"eval_logps/chosen": -510.45111083984375,
"eval_logps/rejected": -612.1486206054688,
"eval_loss": 0.4872073829174042,
"eval_rewards/accuracies": 0.7419999837875366,
"eval_rewards/chosen": -2.4584920406341553,
"eval_rewards/margins": 1.2172898054122925,
"eval_rewards/rejected": -3.675781488418579,
"eval_runtime": 1597.3321,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 2500
},
{
"epoch": 0.656896100497252,
"grad_norm": 10.693954467773438,
"learning_rate": 1.5893821994479996e-06,
"logits/chosen": -1.3317979574203491,
"logits/rejected": -1.2107937335968018,
"logps/chosen": -518.4515380859375,
"logps/rejected": -599.1994018554688,
"loss": 0.479,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.410822629928589,
"rewards/margins": 1.1930694580078125,
"rewards/rejected": -3.6038920879364014,
"step": 2510
},
{
"epoch": 0.6595132164354881,
"grad_norm": 10.660543441772461,
"learning_rate": 1.5681452623266868e-06,
"logits/chosen": -1.3140581846237183,
"logits/rejected": -1.059616208076477,
"logps/chosen": -549.7576293945312,
"logps/rejected": -629.7240600585938,
"loss": 0.4838,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.60799503326416,
"rewards/margins": 1.3822691440582275,
"rewards/rejected": -3.9902641773223877,
"step": 2520
},
{
"epoch": 0.6621303323737242,
"grad_norm": 9.327082633972168,
"learning_rate": 1.5469861348078014e-06,
"logits/chosen": -1.315731167793274,
"logits/rejected": -1.1592333316802979,
"logps/chosen": -516.8220825195312,
"logps/rejected": -651.5704956054688,
"loss": 0.4178,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.7009530067443848,
"rewards/margins": 1.4539804458618164,
"rewards/rejected": -4.154933929443359,
"step": 2530
},
{
"epoch": 0.6647474483119602,
"grad_norm": 14.643211364746094,
"learning_rate": 1.5259065836724035e-06,
"logits/chosen": -1.1933174133300781,
"logits/rejected": -1.119011402130127,
"logps/chosen": -520.9530639648438,
"logps/rejected": -660.1638793945312,
"loss": 0.4109,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.762756824493408,
"rewards/margins": 1.4529297351837158,
"rewards/rejected": -4.215685844421387,
"step": 2540
},
{
"epoch": 0.6673645642501963,
"grad_norm": 23.931337356567383,
"learning_rate": 1.5049083690569456e-06,
"logits/chosen": -1.2723230123519897,
"logits/rejected": -1.1774482727050781,
"logps/chosen": -514.2723999023438,
"logps/rejected": -640.9010009765625,
"loss": 0.5081,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.768399477005005,
"rewards/margins": 1.3103950023651123,
"rewards/rejected": -4.078794002532959,
"step": 2550
},
{
"epoch": 0.6699816801884323,
"grad_norm": 10.859006881713867,
"learning_rate": 1.4839932443063057e-06,
"logits/chosen": -1.286709189414978,
"logits/rejected": -1.130676507949829,
"logps/chosen": -553.838134765625,
"logps/rejected": -631.344482421875,
"loss": 0.459,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.619520425796509,
"rewards/margins": 1.3386470079421997,
"rewards/rejected": -3.958167314529419,
"step": 2560
},
{
"epoch": 0.6725987961266684,
"grad_norm": 14.106218338012695,
"learning_rate": 1.4631629558273803e-06,
"logits/chosen": -1.3286449909210205,
"logits/rejected": -1.205742597579956,
"logps/chosen": -493.1719665527344,
"logps/rejected": -586.0244140625,
"loss": 0.5866,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.515252113342285,
"rewards/margins": 1.044698715209961,
"rewards/rejected": -3.559950590133667,
"step": 2570
},
{
"epoch": 0.6752159120649045,
"grad_norm": 6.197361469268799,
"learning_rate": 1.4424192429432657e-06,
"logits/chosen": -1.4131492376327515,
"logits/rejected": -1.3109676837921143,
"logps/chosen": -469.304931640625,
"logps/rejected": -598.4210815429688,
"loss": 0.4614,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.13527250289917,
"rewards/margins": 1.2581441402435303,
"rewards/rejected": -3.3934166431427,
"step": 2580
},
{
"epoch": 0.6778330280031405,
"grad_norm": 13.008953094482422,
"learning_rate": 1.421763837748016e-06,
"logits/chosen": -1.3615756034851074,
"logits/rejected": -1.268027901649475,
"logps/chosen": -481.23358154296875,
"logps/rejected": -605.3997802734375,
"loss": 0.4241,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.3075268268585205,
"rewards/margins": 1.3260997533798218,
"rewards/rejected": -3.6336264610290527,
"step": 2590
},
{
"epoch": 0.6804501439413766,
"grad_norm": 10.1073637008667,
"learning_rate": 1.401198464962021e-06,
"logits/chosen": -1.3610389232635498,
"logits/rejected": -1.198563814163208,
"logps/chosen": -530.4500732421875,
"logps/rejected": -605.023681640625,
"loss": 0.4649,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.6202147006988525,
"rewards/margins": 1.1309864521026611,
"rewards/rejected": -3.7512009143829346,
"step": 2600
},
{
"epoch": 0.6804501439413766,
"eval_logits/chosen": -1.2114638090133667,
"eval_logits/rejected": -1.079287052154541,
"eval_logps/chosen": -522.1907958984375,
"eval_logps/rejected": -632.8793334960938,
"eval_loss": 0.48811665177345276,
"eval_rewards/accuracies": 0.7450000047683716,
"eval_rewards/chosen": -2.5758883953094482,
"eval_rewards/margins": 1.307201623916626,
"eval_rewards/rejected": -3.883090019226074,
"eval_runtime": 1597.3459,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 2600
},
{
"epoch": 0.6830672598796127,
"grad_norm": 9.867508888244629,
"learning_rate": 1.3807248417879896e-06,
"logits/chosen": -1.401888132095337,
"logits/rejected": -1.276673436164856,
"logps/chosen": -532.1458740234375,
"logps/rejected": -652.4072265625,
"loss": 0.4363,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.591766357421875,
"rewards/margins": 1.404159426689148,
"rewards/rejected": -3.9959254264831543,
"step": 2610
},
{
"epoch": 0.6856843758178487,
"grad_norm": 28.170438766479492,
"learning_rate": 1.3603446777675665e-06,
"logits/chosen": -1.209559440612793,
"logits/rejected": -1.0783087015151978,
"logps/chosen": -564.7611694335938,
"logps/rejected": -670.5452270507812,
"loss": 0.5678,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.0164246559143066,
"rewards/margins": 1.2787444591522217,
"rewards/rejected": -4.295169353485107,
"step": 2620
},
{
"epoch": 0.6883014917560848,
"grad_norm": 9.69133472442627,
"learning_rate": 1.3400596746385817e-06,
"logits/chosen": -1.3486610651016235,
"logits/rejected": -1.1891324520111084,
"logps/chosen": -538.7862548828125,
"logps/rejected": -633.6087646484375,
"loss": 0.4968,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.6963882446289062,
"rewards/margins": 1.2195419073104858,
"rewards/rejected": -3.9159302711486816,
"step": 2630
},
{
"epoch": 0.6909186076943209,
"grad_norm": 11.340239524841309,
"learning_rate": 1.3198715261929587e-06,
"logits/chosen": -1.339261531829834,
"logits/rejected": -1.186591386795044,
"logps/chosen": -508.19189453125,
"logps/rejected": -621.9693603515625,
"loss": 0.4294,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.717308759689331,
"rewards/margins": 1.2740724086761475,
"rewards/rejected": -3.9913814067840576,
"step": 2640
},
{
"epoch": 0.6935357236325569,
"grad_norm": 8.626651763916016,
"learning_rate": 1.2997819181352823e-06,
"logits/chosen": -1.3360286951065063,
"logits/rejected": -1.1647727489471436,
"logps/chosen": -564.6326904296875,
"logps/rejected": -691.1988525390625,
"loss": 0.4326,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.678508996963501,
"rewards/margins": 1.511307954788208,
"rewards/rejected": -4.189816474914551,
"step": 2650
},
{
"epoch": 0.696152839570793,
"grad_norm": 21.902114868164062,
"learning_rate": 1.2797925279420454e-06,
"logits/chosen": -1.2936923503875732,
"logits/rejected": -1.1569340229034424,
"logps/chosen": -576.9588623046875,
"logps/rejected": -705.35791015625,
"loss": 0.4632,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.0516982078552246,
"rewards/margins": 1.4037137031555176,
"rewards/rejected": -4.4554123878479,
"step": 2660
},
{
"epoch": 0.6987699555090291,
"grad_norm": 12.999093055725098,
"learning_rate": 1.2599050247215764e-06,
"logits/chosen": -1.2631757259368896,
"logits/rejected": -1.1437580585479736,
"logps/chosen": -550.2919921875,
"logps/rejected": -659.2994384765625,
"loss": 0.4911,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.9243297576904297,
"rewards/margins": 1.3196732997894287,
"rewards/rejected": -4.244002819061279,
"step": 2670
},
{
"epoch": 0.7013870714472651,
"grad_norm": 15.106201171875,
"learning_rate": 1.2401210690746705e-06,
"logits/chosen": -1.2861721515655518,
"logits/rejected": -1.1344573497772217,
"logps/chosen": -562.5394287109375,
"logps/rejected": -647.7611083984375,
"loss": 0.5272,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.9545741081237793,
"rewards/margins": 1.1845426559448242,
"rewards/rejected": -4.1391167640686035,
"step": 2680
},
{
"epoch": 0.7040041873855012,
"grad_norm": 9.087705612182617,
"learning_rate": 1.2204423129559306e-06,
"logits/chosen": -1.3246266841888428,
"logits/rejected": -1.2674678564071655,
"logps/chosen": -540.13037109375,
"logps/rejected": -657.595947265625,
"loss": 0.492,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.7469382286071777,
"rewards/margins": 1.2162708044052124,
"rewards/rejected": -3.963209629058838,
"step": 2690
},
{
"epoch": 0.7066213033237373,
"grad_norm": 18.654743194580078,
"learning_rate": 1.20087039953583e-06,
"logits/chosen": -1.3834110498428345,
"logits/rejected": -1.2643133401870728,
"logps/chosen": -513.6646728515625,
"logps/rejected": -604.9307250976562,
"loss": 0.556,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.4955801963806152,
"rewards/margins": 1.1764256954193115,
"rewards/rejected": -3.6720058917999268,
"step": 2700
},
{
"epoch": 0.7066213033237373,
"eval_logits/chosen": -1.2295472621917725,
"eval_logits/rejected": -1.1003633737564087,
"eval_logps/chosen": -498.9264831542969,
"eval_logps/rejected": -595.6959228515625,
"eval_loss": 0.4841165840625763,
"eval_rewards/accuracies": 0.7459999918937683,
"eval_rewards/chosen": -2.343245029449463,
"eval_rewards/margins": 1.1680108308792114,
"eval_rewards/rejected": -3.5112557411193848,
"eval_runtime": 1597.0057,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 2700
},
{
"epoch": 0.7092384192619733,
"grad_norm": 19.703975677490234,
"learning_rate": 1.181406963063507e-06,
"logits/chosen": -1.3118432760238647,
"logits/rejected": -1.2396559715270996,
"logps/chosen": -480.10662841796875,
"logps/rejected": -590.1580200195312,
"loss": 0.4967,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.1435210704803467,
"rewards/margins": 1.1070889234542847,
"rewards/rejected": -3.250609874725342,
"step": 2710
},
{
"epoch": 0.7118555352002094,
"grad_norm": 5.974539279937744,
"learning_rate": 1.1620536287303052e-06,
"logits/chosen": -1.4100855588912964,
"logits/rejected": -1.2877388000488281,
"logps/chosen": -500.7962951660156,
"logps/rejected": -562.1715087890625,
"loss": 0.5414,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.097153902053833,
"rewards/margins": 0.9155322909355164,
"rewards/rejected": -3.012686014175415,
"step": 2720
},
{
"epoch": 0.7144726511384454,
"grad_norm": 9.851229667663574,
"learning_rate": 1.1428120125340717e-06,
"logits/chosen": -1.3525193929672241,
"logits/rejected": -1.19950532913208,
"logps/chosen": -445.181884765625,
"logps/rejected": -556.6793212890625,
"loss": 0.3931,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.911560297012329,
"rewards/margins": 1.5294151306152344,
"rewards/rejected": -3.4409751892089844,
"step": 2730
},
{
"epoch": 0.7170897670766815,
"grad_norm": 9.149617195129395,
"learning_rate": 1.123683721144223e-06,
"logits/chosen": -1.3333719968795776,
"logits/rejected": -1.2258195877075195,
"logps/chosen": -494.1949157714844,
"logps/rejected": -600.24658203125,
"loss": 0.4166,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.106128692626953,
"rewards/margins": 1.385012149810791,
"rewards/rejected": -3.491140842437744,
"step": 2740
},
{
"epoch": 0.7197068830149176,
"grad_norm": 7.582621097564697,
"learning_rate": 1.1046703517675848e-06,
"logits/chosen": -1.3422666788101196,
"logits/rejected": -1.2579666376113892,
"logps/chosen": -470.9822692871094,
"logps/rejected": -587.7799072265625,
"loss": 0.5163,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.159165620803833,
"rewards/margins": 1.106806993484497,
"rewards/rejected": -3.26597261428833,
"step": 2750
},
{
"epoch": 0.7223239989531536,
"grad_norm": 12.026503562927246,
"learning_rate": 1.085773492015028e-06,
"logits/chosen": -1.3507201671600342,
"logits/rejected": -1.1788911819458008,
"logps/chosen": -452.8814392089844,
"logps/rejected": -557.76416015625,
"loss": 0.4254,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.065864324569702,
"rewards/margins": 1.3924720287322998,
"rewards/rejected": -3.458336353302002,
"step": 2760
},
{
"epoch": 0.7249411148913897,
"grad_norm": 16.423751831054688,
"learning_rate": 1.0669947197689034e-06,
"logits/chosen": -1.3033778667449951,
"logits/rejected": -1.1522514820098877,
"logps/chosen": -498.0997619628906,
"logps/rejected": -590.888427734375,
"loss": 0.4935,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.2554616928100586,
"rewards/margins": 1.2149760723114014,
"rewards/rejected": -3.470437526702881,
"step": 2770
},
{
"epoch": 0.7275582308296258,
"grad_norm": 8.218092918395996,
"learning_rate": 1.048335603051291e-06,
"logits/chosen": -1.2994598150253296,
"logits/rejected": -1.1608821153640747,
"logps/chosen": -527.0745849609375,
"logps/rejected": -638.7930908203125,
"loss": 0.4518,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.3942160606384277,
"rewards/margins": 1.3985862731933594,
"rewards/rejected": -3.792802333831787,
"step": 2780
},
{
"epoch": 0.7301753467678618,
"grad_norm": 12.542786598205566,
"learning_rate": 1.0297976998930665e-06,
"logits/chosen": -1.2890033721923828,
"logits/rejected": -1.1566731929779053,
"logps/chosen": -490.1463928222656,
"logps/rejected": -611.6651611328125,
"loss": 0.4472,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.296391010284424,
"rewards/margins": 1.492492437362671,
"rewards/rejected": -3.7888832092285156,
"step": 2790
},
{
"epoch": 0.7327924627060979,
"grad_norm": 10.247793197631836,
"learning_rate": 1.0113825582038078e-06,
"logits/chosen": -1.329484224319458,
"logits/rejected": -1.2037220001220703,
"logps/chosen": -503.18951416015625,
"logps/rejected": -605.0428466796875,
"loss": 0.4617,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.380659580230713,
"rewards/margins": 1.1856248378753662,
"rewards/rejected": -3.5662841796875,
"step": 2800
},
{
"epoch": 0.7327924627060979,
"eval_logits/chosen": -1.195982813835144,
"eval_logits/rejected": -1.0626633167266846,
"eval_logps/chosen": -499.5495910644531,
"eval_logps/rejected": -606.4032592773438,
"eval_loss": 0.4832090735435486,
"eval_rewards/accuracies": 0.7459999918937683,
"eval_rewards/chosen": -2.3494763374328613,
"eval_rewards/margins": 1.2688524723052979,
"eval_rewards/rejected": -3.61832857131958,
"eval_runtime": 1596.6931,
"eval_samples_per_second": 1.253,
"eval_steps_per_second": 0.157,
"step": 2800
},
{
"epoch": 0.735409578644334,
"grad_norm": 20.173498153686523,
"learning_rate": 9.930917156425477e-07,
"logits/chosen": -1.2981245517730713,
"logits/rejected": -1.1898633241653442,
"logps/chosen": -514.4564819335938,
"logps/rejected": -633.3460693359375,
"loss": 0.5311,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.5339112281799316,
"rewards/margins": 1.2178757190704346,
"rewards/rejected": -3.751786708831787,
"step": 2810
},
{
"epoch": 0.73802669458257,
"grad_norm": 20.3377742767334,
"learning_rate": 9.749266994893756e-07,
"logits/chosen": -1.2338041067123413,
"logits/rejected": -1.084707498550415,
"logps/chosen": -491.3687438964844,
"logps/rejected": -575.6991577148438,
"loss": 0.5842,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.5079922676086426,
"rewards/margins": 0.9769840240478516,
"rewards/rejected": -3.484976291656494,
"step": 2820
},
{
"epoch": 0.7406438105208061,
"grad_norm": 10.66334056854248,
"learning_rate": 9.56889026517913e-07,
"logits/chosen": -1.2951042652130127,
"logits/rejected": -1.1750590801239014,
"logps/chosen": -511.2039489746094,
"logps/rejected": -602.0439453125,
"loss": 0.4889,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.499505043029785,
"rewards/margins": 1.1682642698287964,
"rewards/rejected": -3.66776967048645,
"step": 2830
},
{
"epoch": 0.7432609264590422,
"grad_norm": 8.688248634338379,
"learning_rate": 9.389802028686617e-07,
"logits/chosen": -1.371790885925293,
"logits/rejected": -1.270684003829956,
"logps/chosen": -503.0506896972656,
"logps/rejected": -553.471923828125,
"loss": 0.5805,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.423825979232788,
"rewards/margins": 0.8341992497444153,
"rewards/rejected": -3.2580254077911377,
"step": 2840
},
{
"epoch": 0.7458780423972782,
"grad_norm": 13.514219284057617,
"learning_rate": 9.212017239232427e-07,
"logits/chosen": -1.2997102737426758,
"logits/rejected": -1.1494895219802856,
"logps/chosen": -497.5753479003906,
"logps/rejected": -604.2395629882812,
"loss": 0.4702,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.2467424869537354,
"rewards/margins": 1.2961435317993164,
"rewards/rejected": -3.542886257171631,
"step": 2850
},
{
"epoch": 0.7484951583355143,
"grad_norm": 10.475621223449707,
"learning_rate": 9.03555074179533e-07,
"logits/chosen": -1.2712374925613403,
"logits/rejected": -1.236365795135498,
"logps/chosen": -477.6539611816406,
"logps/rejected": -613.723876953125,
"loss": 0.4366,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.210069179534912,
"rewards/margins": 1.3023548126220703,
"rewards/rejected": -3.5124244689941406,
"step": 2860
},
{
"epoch": 0.7511122742737504,
"grad_norm": 11.980389595031738,
"learning_rate": 8.860417271277067e-07,
"logits/chosen": -1.3676173686981201,
"logits/rejected": -1.303006887435913,
"logps/chosen": -500.90008544921875,
"logps/rejected": -591.3121948242188,
"loss": 0.5046,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.3170719146728516,
"rewards/margins": 0.9840759038925171,
"rewards/rejected": -3.3011481761932373,
"step": 2870
},
{
"epoch": 0.7537293902119864,
"grad_norm": 8.44219970703125,
"learning_rate": 8.686631451272029e-07,
"logits/chosen": -1.323055624961853,
"logits/rejected": -1.1673837900161743,
"logps/chosen": -512.1810302734375,
"logps/rejected": -617.2755126953125,
"loss": 0.4958,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.588519811630249,
"rewards/margins": 1.3053323030471802,
"rewards/rejected": -3.8938522338867188,
"step": 2880
},
{
"epoch": 0.7563465061502225,
"grad_norm": 6.950828552246094,
"learning_rate": 8.514207792846168e-07,
"logits/chosen": -1.3387699127197266,
"logits/rejected": -1.2082509994506836,
"logps/chosen": -511.2001953125,
"logps/rejected": -603.291748046875,
"loss": 0.4873,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.6037611961364746,
"rewards/margins": 1.2057433128356934,
"rewards/rejected": -3.809504747390747,
"step": 2890
},
{
"epoch": 0.7589636220884585,
"grad_norm": 7.15659236907959,
"learning_rate": 8.343160693325356e-07,
"logits/chosen": -1.2071359157562256,
"logits/rejected": -1.0866135358810425,
"logps/chosen": -522.898193359375,
"logps/rejected": -642.6585083007812,
"loss": 0.4916,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.6741576194763184,
"rewards/margins": 1.2413873672485352,
"rewards/rejected": -3.9155445098876953,
"step": 2900
},
{
"epoch": 0.7589636220884585,
"eval_logits/chosen": -1.1417629718780518,
"eval_logits/rejected": -1.0031887292861938,
"eval_logps/chosen": -531.7142333984375,
"eval_logps/rejected": -636.2195434570312,
"eval_loss": 0.4799574017524719,
"eval_rewards/accuracies": 0.7455000281333923,
"eval_rewards/chosen": -2.6711227893829346,
"eval_rewards/margins": 1.2453694343566895,
"eval_rewards/rejected": -3.916492223739624,
"eval_runtime": 1597.0819,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 2900
},
{
"epoch": 0.7615807380266946,
"grad_norm": 8.567606925964355,
"learning_rate": 8.173504435093174e-07,
"logits/chosen": -1.228169560432434,
"logits/rejected": -1.035468339920044,
"logps/chosen": -499.9391174316406,
"logps/rejected": -596.357666015625,
"loss": 0.483,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.584282398223877,
"rewards/margins": 1.2857377529144287,
"rewards/rejected": -3.8700199127197266,
"step": 2910
},
{
"epoch": 0.7641978539649307,
"grad_norm": 8.268507957458496,
"learning_rate": 8.00525318439836e-07,
"logits/chosen": -1.2574290037155151,
"logits/rejected": -1.1231721639633179,
"logps/chosen": -538.8193969726562,
"logps/rejected": -632.8656005859375,
"loss": 0.5454,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.62135648727417,
"rewards/margins": 1.0157769918441772,
"rewards/rejected": -3.637133836746216,
"step": 2920
},
{
"epoch": 0.7668149699031667,
"grad_norm": 6.73305606842041,
"learning_rate": 7.838420990171927e-07,
"logits/chosen": -1.3410282135009766,
"logits/rejected": -1.1771111488342285,
"logps/chosen": -512.6897583007812,
"logps/rejected": -595.9611206054688,
"loss": 0.5007,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.4339559078216553,
"rewards/margins": 1.0879731178283691,
"rewards/rejected": -3.5219292640686035,
"step": 2930
},
{
"epoch": 0.7694320858414028,
"grad_norm": 8.820892333984375,
"learning_rate": 7.673021782854084e-07,
"logits/chosen": -1.195052981376648,
"logits/rejected": -1.0439643859863281,
"logps/chosen": -517.55810546875,
"logps/rejected": -611.39111328125,
"loss": 0.4715,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.5697178840637207,
"rewards/margins": 1.3570950031280518,
"rewards/rejected": -3.9268131256103516,
"step": 2940
},
{
"epoch": 0.7720492017796389,
"grad_norm": 10.797639846801758,
"learning_rate": 7.509069373231039e-07,
"logits/chosen": -1.2065623998641968,
"logits/rejected": -1.0761216878890991,
"logps/chosen": -521.6849365234375,
"logps/rejected": -586.7667236328125,
"loss": 0.5756,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.6908392906188965,
"rewards/margins": 0.898908257484436,
"rewards/rejected": -3.589747667312622,
"step": 2950
},
{
"epoch": 0.7746663177178749,
"grad_norm": 9.844982147216797,
"learning_rate": 7.346577451281822e-07,
"logits/chosen": -1.219063639640808,
"logits/rejected": -1.133622407913208,
"logps/chosen": -516.6390991210938,
"logps/rejected": -624.2025146484375,
"loss": 0.4668,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.544440269470215,
"rewards/margins": 1.317181944847107,
"rewards/rejected": -3.8616223335266113,
"step": 2960
},
{
"epoch": 0.777283433656111,
"grad_norm": 12.88412094116211,
"learning_rate": 7.185559585035138e-07,
"logits/chosen": -1.269226312637329,
"logits/rejected": -1.0988438129425049,
"logps/chosen": -547.8538208007812,
"logps/rejected": -646.4948120117188,
"loss": 0.4824,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.6517109870910645,
"rewards/margins": 1.145399570465088,
"rewards/rejected": -3.7971103191375732,
"step": 2970
},
{
"epoch": 0.7799005495943471,
"grad_norm": 8.924067497253418,
"learning_rate": 7.026029219436504e-07,
"logits/chosen": -1.2982590198516846,
"logits/rejected": -1.1203540563583374,
"logps/chosen": -504.6836853027344,
"logps/rejected": -623.8721313476562,
"loss": 0.4545,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.5236408710479736,
"rewards/margins": 1.3211889266967773,
"rewards/rejected": -3.844829559326172,
"step": 2980
},
{
"epoch": 0.7825176655325831,
"grad_norm": 6.5459513664245605,
"learning_rate": 6.867999675225523e-07,
"logits/chosen": -1.3317945003509521,
"logits/rejected": -1.1817572116851807,
"logps/chosen": -483.4960021972656,
"logps/rejected": -588.3883056640625,
"loss": 0.4648,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.526824712753296,
"rewards/margins": 1.1909806728363037,
"rewards/rejected": -3.7178053855895996,
"step": 2990
},
{
"epoch": 0.7851347814708192,
"grad_norm": 8.477150917053223,
"learning_rate": 6.711484147823663e-07,
"logits/chosen": -1.2142189741134644,
"logits/rejected": -1.1248340606689453,
"logps/chosen": -488.151123046875,
"logps/rejected": -618.1202392578125,
"loss": 0.4708,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.5399701595306396,
"rewards/margins": 1.251422643661499,
"rewards/rejected": -3.7913928031921387,
"step": 3000
},
{
"epoch": 0.7851347814708192,
"eval_logits/chosen": -1.1355363130569458,
"eval_logits/rejected": -0.9962058663368225,
"eval_logps/chosen": -526.2620849609375,
"eval_logps/rejected": -623.4008178710938,
"eval_loss": 0.4796713590621948,
"eval_rewards/accuracies": 0.7475000023841858,
"eval_rewards/chosen": -2.61660099029541,
"eval_rewards/margins": 1.1717036962509155,
"eval_rewards/rejected": -3.788304328918457,
"eval_runtime": 1596.8532,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 3000
},
{
"epoch": 0.7877518974090553,
"grad_norm": 9.99060344696045,
"learning_rate": 6.556495706232413e-07,
"logits/chosen": -1.2248764038085938,
"logits/rejected": -1.1362669467926025,
"logps/chosen": -536.1866455078125,
"logps/rejected": -627.3114013671875,
"loss": 0.532,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.6758596897125244,
"rewards/margins": 1.0958218574523926,
"rewards/rejected": -3.771681308746338,
"step": 3010
},
{
"epoch": 0.7903690133472913,
"grad_norm": 10.53675365447998,
"learning_rate": 6.403047291942057e-07,
"logits/chosen": -1.1557037830352783,
"logits/rejected": -0.9910370111465454,
"logps/chosen": -491.5957946777344,
"logps/rejected": -585.2208251953125,
"loss": 0.4845,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.6533892154693604,
"rewards/margins": 1.1808980703353882,
"rewards/rejected": -3.834287166595459,
"step": 3020
},
{
"epoch": 0.7929861292855274,
"grad_norm": 12.331343650817871,
"learning_rate": 6.251151717851023e-07,
"logits/chosen": -1.2338765859603882,
"logits/rejected": -1.136103630065918,
"logps/chosen": -477.06243896484375,
"logps/rejected": -573.8302001953125,
"loss": 0.4934,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.4373462200164795,
"rewards/margins": 1.100510835647583,
"rewards/rejected": -3.5378570556640625,
"step": 3030
},
{
"epoch": 0.7956032452237635,
"grad_norm": 6.849348545074463,
"learning_rate": 6.100821667196041e-07,
"logits/chosen": -1.4195866584777832,
"logits/rejected": -1.1471283435821533,
"logps/chosen": -517.2559814453125,
"logps/rejected": -562.2097778320312,
"loss": 0.4849,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.4172825813293457,
"rewards/margins": 1.1046142578125,
"rewards/rejected": -3.5218968391418457,
"step": 3040
},
{
"epoch": 0.7982203611619995,
"grad_norm": 7.553493022918701,
"learning_rate": 5.952069692493062e-07,
"logits/chosen": -1.2353808879852295,
"logits/rejected": -1.117495059967041,
"logps/chosen": -466.36297607421875,
"logps/rejected": -601.0093994140625,
"loss": 0.4162,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.381345272064209,
"rewards/margins": 1.3122522830963135,
"rewards/rejected": -3.6935970783233643,
"step": 3050
},
{
"epoch": 0.8008374771002356,
"grad_norm": 9.807490348815918,
"learning_rate": 5.80490821448918e-07,
"logits/chosen": -1.1880605220794678,
"logits/rejected": -1.173678994178772,
"logps/chosen": -516.3980102539062,
"logps/rejected": -701.3677978515625,
"loss": 0.4261,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.5393073558807373,
"rewards/margins": 1.4251797199249268,
"rewards/rejected": -3.9644875526428223,
"step": 3060
},
{
"epoch": 0.8034545930384716,
"grad_norm": 8.029556274414062,
"learning_rate": 5.659349521125459e-07,
"logits/chosen": -1.3551298379898071,
"logits/rejected": -1.2928274869918823,
"logps/chosen": -548.3309326171875,
"logps/rejected": -636.1687622070312,
"loss": 0.5164,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.670652389526367,
"rewards/margins": 1.083081603050232,
"rewards/rejected": -3.7537341117858887,
"step": 3070
},
{
"epoch": 0.8060717089767077,
"grad_norm": 10.774998664855957,
"learning_rate": 5.5154057665109e-07,
"logits/chosen": -1.3124372959136963,
"logits/rejected": -1.1519577503204346,
"logps/chosen": -552.2019653320312,
"logps/rejected": -666.1383666992188,
"loss": 0.4952,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.9138247966766357,
"rewards/margins": 1.403515338897705,
"rewards/rejected": -4.31734037399292,
"step": 3080
},
{
"epoch": 0.8086888249149438,
"grad_norm": 6.534247398376465,
"learning_rate": 5.373088969907586e-07,
"logits/chosen": -1.3402431011199951,
"logits/rejected": -1.147871732711792,
"logps/chosen": -560.8900756835938,
"logps/rejected": -637.8880004882812,
"loss": 0.4401,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.8420605659484863,
"rewards/margins": 1.2714219093322754,
"rewards/rejected": -4.113482475280762,
"step": 3090
},
{
"epoch": 0.8113059408531798,
"grad_norm": 7.741410732269287,
"learning_rate": 5.23241101472709e-07,
"logits/chosen": -1.2486029863357544,
"logits/rejected": -1.1143968105316162,
"logps/chosen": -550.2741088867188,
"logps/rejected": -650.1898803710938,
"loss": 0.4804,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.740609645843506,
"rewards/margins": 1.2194688320159912,
"rewards/rejected": -3.960078001022339,
"step": 3100
},
{
"epoch": 0.8113059408531798,
"eval_logits/chosen": -1.1341181993484497,
"eval_logits/rejected": -0.9953464865684509,
"eval_logps/chosen": -546.843505859375,
"eval_logps/rejected": -656.7727661132812,
"eval_loss": 0.4807169735431671,
"eval_rewards/accuracies": 0.7475000023841858,
"eval_rewards/chosen": -2.822416067123413,
"eval_rewards/margins": 1.2996082305908203,
"eval_rewards/rejected": -4.122024059295654,
"eval_runtime": 1596.5373,
"eval_samples_per_second": 1.253,
"eval_steps_per_second": 0.157,
"step": 3100
},
{
"epoch": 0.8139230567914159,
"grad_norm": 12.117105484008789,
"learning_rate": 5.09338364753818e-07,
"logits/chosen": -1.3339240550994873,
"logits/rejected": -1.1453027725219727,
"logps/chosen": -560.3875122070312,
"logps/rejected": -672.76513671875,
"loss": 0.5239,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.750267267227173,
"rewards/margins": 1.282500982284546,
"rewards/rejected": -4.032768249511719,
"step": 3110
},
{
"epoch": 0.816540172729652,
"grad_norm": 9.4435396194458,
"learning_rate": 4.956018477086005e-07,
"logits/chosen": -1.2970499992370605,
"logits/rejected": -1.1291049718856812,
"logps/chosen": -554.2510986328125,
"logps/rejected": -649.1993408203125,
"loss": 0.5269,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.851862668991089,
"rewards/margins": 1.2229183912277222,
"rewards/rejected": -4.07478141784668,
"step": 3120
},
{
"epoch": 0.819157288667888,
"grad_norm": 14.918078422546387,
"learning_rate": 4.820326973322764e-07,
"logits/chosen": -1.207421064376831,
"logits/rejected": -1.115781307220459,
"logps/chosen": -536.1851806640625,
"logps/rejected": -655.1953735351562,
"loss": 0.5314,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.8535518646240234,
"rewards/margins": 1.2501590251922607,
"rewards/rejected": -4.103711128234863,
"step": 3130
},
{
"epoch": 0.821774404606124,
"grad_norm": 14.224996566772461,
"learning_rate": 4.686320466449981e-07,
"logits/chosen": -1.1797192096710205,
"logits/rejected": -0.9966877102851868,
"logps/chosen": -504.07745361328125,
"logps/rejected": -650.1423950195312,
"loss": 0.4351,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.6490511894226074,
"rewards/margins": 1.4823657274246216,
"rewards/rejected": -4.131417274475098,
"step": 3140
},
{
"epoch": 0.8243915205443602,
"grad_norm": 7.642600059509277,
"learning_rate": 4.554010145972418e-07,
"logits/chosen": -1.366541862487793,
"logits/rejected": -1.1694148778915405,
"logps/chosen": -543.9080200195312,
"logps/rejected": -658.1217041015625,
"loss": 0.5434,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.8025176525115967,
"rewards/margins": 1.2542707920074463,
"rewards/rejected": -4.056788444519043,
"step": 3150
},
{
"epoch": 0.8270086364825961,
"grad_norm": 10.804971694946289,
"learning_rate": 4.4234070597637455e-07,
"logits/chosen": -1.1909892559051514,
"logits/rejected": -1.0947834253311157,
"logps/chosen": -546.0935668945312,
"logps/rejected": -647.5870361328125,
"loss": 0.5018,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.70656156539917,
"rewards/margins": 1.1345676183700562,
"rewards/rejected": -3.8411293029785156,
"step": 3160
},
{
"epoch": 0.8296257524208323,
"grad_norm": 6.038959980010986,
"learning_rate": 4.2945221131440783e-07,
"logits/chosen": -1.1933271884918213,
"logits/rejected": -0.9797855615615845,
"logps/chosen": -524.9500732421875,
"logps/rejected": -634.7277221679688,
"loss": 0.4177,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.5928843021392822,
"rewards/margins": 1.4146864414215088,
"rewards/rejected": -4.007569789886475,
"step": 3170
},
{
"epoch": 0.8322428683590684,
"grad_norm": 8.643757820129395,
"learning_rate": 4.167366067969381e-07,
"logits/chosen": -1.3083521127700806,
"logits/rejected": -1.2275665998458862,
"logps/chosen": -480.93267822265625,
"logps/rejected": -616.6661376953125,
"loss": 0.4995,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.5809030532836914,
"rewards/margins": 1.1199270486831665,
"rewards/rejected": -3.7008299827575684,
"step": 3180
},
{
"epoch": 0.8348599842973043,
"grad_norm": 5.735963821411133,
"learning_rate": 4.041949541732826e-07,
"logits/chosen": -1.306793451309204,
"logits/rejected": -1.2211034297943115,
"logps/chosen": -528.7879638671875,
"logps/rejected": -633.7890625,
"loss": 0.5016,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.649538040161133,
"rewards/margins": 1.1934170722961426,
"rewards/rejected": -3.8429553508758545,
"step": 3190
},
{
"epoch": 0.8374771002355405,
"grad_norm": 11.755797386169434,
"learning_rate": 3.9182830066782614e-07,
"logits/chosen": -1.2019577026367188,
"logits/rejected": -1.1839076280593872,
"logps/chosen": -523.3551025390625,
"logps/rejected": -663.1622314453125,
"loss": 0.4866,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.657108783721924,
"rewards/margins": 1.273402452468872,
"rewards/rejected": -3.930511474609375,
"step": 3200
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -1.1640751361846924,
"eval_logits/rejected": -1.0276466608047485,
"eval_logps/chosen": -519.5614013671875,
"eval_logps/rejected": -623.51025390625,
"eval_loss": 0.4776689112186432,
"eval_rewards/accuracies": 0.7475000023841858,
"eval_rewards/chosen": -2.5495944023132324,
"eval_rewards/margins": 1.2398039102554321,
"eval_rewards/rejected": -3.789398431777954,
"eval_runtime": 1596.1554,
"eval_samples_per_second": 1.253,
"eval_steps_per_second": 0.157,
"step": 3200
},
{
"epoch": 0.8400942161737766,
"grad_norm": 6.5980448722839355,
"learning_rate": 3.796376788925771e-07,
"logits/chosen": -1.2055485248565674,
"logits/rejected": -1.1355664730072021,
"logps/chosen": -506.8805236816406,
"logps/rejected": -596.1102294921875,
"loss": 0.503,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.4550302028656006,
"rewards/margins": 1.1328961849212646,
"rewards/rejected": -3.5879263877868652,
"step": 3210
},
{
"epoch": 0.8427113321120125,
"grad_norm": 7.025763034820557,
"learning_rate": 3.676241067609465e-07,
"logits/chosen": -1.2890058755874634,
"logits/rejected": -1.1607505083084106,
"logps/chosen": -542.5242919921875,
"logps/rejected": -612.6026611328125,
"loss": 0.5185,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.488643169403076,
"rewards/margins": 1.1322394609451294,
"rewards/rejected": -3.620882749557495,
"step": 3220
},
{
"epoch": 0.8453284480502486,
"grad_norm": 11.517923355102539,
"learning_rate": 3.5578858740274976e-07,
"logits/chosen": -1.2237865924835205,
"logits/rejected": -1.122482419013977,
"logps/chosen": -517.73974609375,
"logps/rejected": -605.0042114257812,
"loss": 0.513,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.5605525970458984,
"rewards/margins": 0.999243438243866,
"rewards/rejected": -3.5597965717315674,
"step": 3230
},
{
"epoch": 0.8479455639884846,
"grad_norm": 10.88183307647705,
"learning_rate": 3.44132109080447e-07,
"logits/chosen": -1.4292676448822021,
"logits/rejected": -1.2410900592803955,
"logps/chosen": -495.89813232421875,
"logps/rejected": -589.0921630859375,
"loss": 0.4351,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.3068690299987793,
"rewards/margins": 1.3076789379119873,
"rewards/rejected": -3.6145482063293457,
"step": 3240
},
{
"epoch": 0.8505626799267207,
"grad_norm": 11.31247615814209,
"learning_rate": 3.3265564510662344e-07,
"logits/chosen": -1.3701411485671997,
"logits/rejected": -1.213030457496643,
"logps/chosen": -523.7352294921875,
"logps/rejected": -627.9140625,
"loss": 0.433,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.305351495742798,
"rewards/margins": 1.2927907705307007,
"rewards/rejected": -3.598142147064209,
"step": 3250
},
{
"epoch": 0.8531797958649568,
"grad_norm": 15.092782974243164,
"learning_rate": 3.213601537627195e-07,
"logits/chosen": -1.2755637168884277,
"logits/rejected": -1.1552997827529907,
"logps/chosen": -517.78369140625,
"logps/rejected": -615.5398559570312,
"loss": 0.5176,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.607300281524658,
"rewards/margins": 1.1482493877410889,
"rewards/rejected": -3.755549669265747,
"step": 3260
},
{
"epoch": 0.8557969118031928,
"grad_norm": 18.888704299926758,
"learning_rate": 3.1024657821901063e-07,
"logits/chosen": -1.3271687030792236,
"logits/rejected": -1.2444849014282227,
"logps/chosen": -477.8196716308594,
"logps/rejected": -587.2836303710938,
"loss": 0.4719,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.301666498184204,
"rewards/margins": 1.2528338432312012,
"rewards/rejected": -3.554500102996826,
"step": 3270
},
{
"epoch": 0.8584140277414289,
"grad_norm": 10.203365325927734,
"learning_rate": 2.9931584645585654e-07,
"logits/chosen": -1.245596170425415,
"logits/rejected": -1.222507357597351,
"logps/chosen": -513.1766357421875,
"logps/rejected": -632.6079711914062,
"loss": 0.4915,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.386977434158325,
"rewards/margins": 1.1650269031524658,
"rewards/rejected": -3.55200457572937,
"step": 3280
},
{
"epoch": 0.861031143679665,
"grad_norm": 4.975100040435791,
"learning_rate": 2.885688711862136e-07,
"logits/chosen": -1.2813748121261597,
"logits/rejected": -1.2710316181182861,
"logps/chosen": -519.4554443359375,
"logps/rejected": -658.6768188476562,
"loss": 0.498,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.585131883621216,
"rewards/margins": 1.4114031791687012,
"rewards/rejected": -3.996534824371338,
"step": 3290
},
{
"epoch": 0.863648259617901,
"grad_norm": 6.192923069000244,
"learning_rate": 2.7800654977942486e-07,
"logits/chosen": -1.268638014793396,
"logits/rejected": -1.1421396732330322,
"logps/chosen": -508.80584716796875,
"logps/rejected": -629.664306640625,
"loss": 0.4967,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.516002893447876,
"rewards/margins": 1.2464697360992432,
"rewards/rejected": -3.762472629547119,
"step": 3300
},
{
"epoch": 0.863648259617901,
"eval_logits/chosen": -1.1608073711395264,
"eval_logits/rejected": -1.0241122245788574,
"eval_logps/chosen": -520.3804321289062,
"eval_logps/rejected": -625.6535034179688,
"eval_loss": 0.47857987880706787,
"eval_rewards/accuracies": 0.7480000257492065,
"eval_rewards/chosen": -2.5577852725982666,
"eval_rewards/margins": 1.2530462741851807,
"eval_rewards/rejected": -3.8108315467834473,
"eval_runtime": 1596.9029,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 3300
},
{
"epoch": 0.8662653755561371,
"grad_norm": 14.262938499450684,
"learning_rate": 2.6762976418628797e-07,
"logits/chosen": -1.2763694524765015,
"logits/rejected": -1.12501859664917,
"logps/chosen": -479.6934509277344,
"logps/rejected": -549.134765625,
"loss": 0.5232,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.4996352195739746,
"rewards/margins": 1.1307785511016846,
"rewards/rejected": -3.630413770675659,
"step": 3310
},
{
"epoch": 0.8688824914943732,
"grad_norm": 6.411558628082275,
"learning_rate": 2.5743938086541354e-07,
"logits/chosen": -1.277998447418213,
"logits/rejected": -1.1551088094711304,
"logps/chosen": -519.344482421875,
"logps/rejected": -614.8687744140625,
"loss": 0.5019,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.5514492988586426,
"rewards/margins": 1.2041881084442139,
"rewards/rejected": -3.7556369304656982,
"step": 3320
},
{
"epoch": 0.8714996074326092,
"grad_norm": 8.243268966674805,
"learning_rate": 2.4743625071087574e-07,
"logits/chosen": -1.444610834121704,
"logits/rejected": -1.2527801990509033,
"logps/chosen": -515.1294555664062,
"logps/rejected": -632.4005737304688,
"loss": 0.4532,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.376555919647217,
"rewards/margins": 1.4679479598999023,
"rewards/rejected": -3.844503879547119,
"step": 3330
},
{
"epoch": 0.8741167233708453,
"grad_norm": 10.0430908203125,
"learning_rate": 2.3762120898116498e-07,
"logits/chosen": -1.2998971939086914,
"logits/rejected": -1.1857765913009644,
"logps/chosen": -535.6951293945312,
"logps/rejected": -634.7950439453125,
"loss": 0.495,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.6694915294647217,
"rewards/margins": 1.0715022087097168,
"rewards/rejected": -3.7409939765930176,
"step": 3340
},
{
"epoch": 0.8767338393090814,
"grad_norm": 9.759672164916992,
"learning_rate": 2.2799507522944048e-07,
"logits/chosen": -1.2516978979110718,
"logits/rejected": -1.143761396408081,
"logps/chosen": -506.92779541015625,
"logps/rejected": -630.4886474609375,
"loss": 0.4682,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.4124372005462646,
"rewards/margins": 1.3180114030838013,
"rewards/rejected": -3.7304489612579346,
"step": 3350
},
{
"epoch": 0.8793509552473174,
"grad_norm": 10.614029884338379,
"learning_rate": 2.1855865323510056e-07,
"logits/chosen": -1.3051875829696655,
"logits/rejected": -1.0945546627044678,
"logps/chosen": -524.2318115234375,
"logps/rejected": -672.577880859375,
"loss": 0.4244,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.515920639038086,
"rewards/margins": 1.5423296689987183,
"rewards/rejected": -4.058249473571777,
"step": 3360
},
{
"epoch": 0.8819680711855535,
"grad_norm": 7.248552322387695,
"learning_rate": 2.0931273093666575e-07,
"logits/chosen": -1.2330162525177002,
"logits/rejected": -1.0763533115386963,
"logps/chosen": -503.97393798828125,
"logps/rejected": -626.6051025390625,
"loss": 0.4239,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.638221025466919,
"rewards/margins": 1.4292933940887451,
"rewards/rejected": -4.067514419555664,
"step": 3370
},
{
"epoch": 0.8845851871237895,
"grad_norm": 15.31811237335205,
"learning_rate": 2.002580803659873e-07,
"logits/chosen": -1.244128942489624,
"logits/rejected": -1.1012144088745117,
"logps/chosen": -522.0372924804688,
"logps/rejected": -629.9783325195312,
"loss": 0.4693,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.676211357116699,
"rewards/margins": 1.2609608173370361,
"rewards/rejected": -3.9371724128723145,
"step": 3380
},
{
"epoch": 0.8872023030620256,
"grad_norm": 5.824941635131836,
"learning_rate": 1.913954575837826e-07,
"logits/chosen": -1.3020130395889282,
"logits/rejected": -1.055781602859497,
"logps/chosen": -542.0621337890625,
"logps/rejected": -616.4013061523438,
"loss": 0.4735,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.7002296447753906,
"rewards/margins": 1.2402583360671997,
"rewards/rejected": -3.9404876232147217,
"step": 3390
},
{
"epoch": 0.8898194190002617,
"grad_norm": 9.155735969543457,
"learning_rate": 1.827256026165028e-07,
"logits/chosen": -1.3028671741485596,
"logits/rejected": -1.1115076541900635,
"logps/chosen": -565.5799560546875,
"logps/rejected": -653.9213256835938,
"loss": 0.4272,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.5059080123901367,
"rewards/margins": 1.455091118812561,
"rewards/rejected": -3.960999011993408,
"step": 3400
},
{
"epoch": 0.8898194190002617,
"eval_logits/chosen": -1.1445426940917969,
"eval_logits/rejected": -1.0071464776992798,
"eval_logps/chosen": -536.8281860351562,
"eval_logps/rejected": -647.4435424804688,
"eval_loss": 0.47965455055236816,
"eval_rewards/accuracies": 0.7459999918937683,
"eval_rewards/chosen": -2.722262382507324,
"eval_rewards/margins": 1.306469440460205,
"eval_rewards/rejected": -4.0287322998046875,
"eval_runtime": 1597.1181,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.157,
"step": 3400
},
{
"epoch": 0.8924365349384977,
"grad_norm": 9.588942527770996,
"learning_rate": 1.7424923939454274e-07,
"logits/chosen": -1.2525078058242798,
"logits/rejected": -1.0694096088409424,
"logps/chosen": -553.6488037109375,
"logps/rejected": -648.3309326171875,
"loss": 0.4282,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.723491668701172,
"rewards/margins": 1.380472183227539,
"rewards/rejected": -4.103963851928711,
"step": 3410
},
{
"epoch": 0.8950536508767338,
"grad_norm": 19.193740844726562,
"learning_rate": 1.6596707569179304e-07,
"logits/chosen": -1.3545329570770264,
"logits/rejected": -1.1915156841278076,
"logps/chosen": -562.5875244140625,
"logps/rejected": -653.5098876953125,
"loss": 0.4928,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.815075635910034,
"rewards/margins": 1.2605375051498413,
"rewards/rejected": -4.075612545013428,
"step": 3420
},
{
"epoch": 0.8976707668149699,
"grad_norm": 14.299762725830078,
"learning_rate": 1.578798030665385e-07,
"logits/chosen": -1.283125638961792,
"logits/rejected": -1.0879403352737427,
"logps/chosen": -551.2545166015625,
"logps/rejected": -682.09228515625,
"loss": 0.4447,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.7663490772247314,
"rewards/margins": 1.4793423414230347,
"rewards/rejected": -4.245691776275635,
"step": 3430
},
{
"epoch": 0.9002878827532059,
"grad_norm": 8.79478645324707,
"learning_rate": 1.499880968037165e-07,
"logits/chosen": -1.2685706615447998,
"logits/rejected": -1.1165311336517334,
"logps/chosen": -530.1332397460938,
"logps/rejected": -616.2289428710938,
"loss": 0.5132,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.7463672161102295,
"rewards/margins": 1.2237727642059326,
"rewards/rejected": -3.970139980316162,
"step": 3440
},
{
"epoch": 0.902904998691442,
"grad_norm": 14.623788833618164,
"learning_rate": 1.4229261585852805e-07,
"logits/chosen": -1.298165202140808,
"logits/rejected": -1.205263614654541,
"logps/chosen": -537.5665283203125,
"logps/rejected": -645.2786865234375,
"loss": 0.4557,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.692805528640747,
"rewards/margins": 1.3347504138946533,
"rewards/rejected": -4.0275559425354,
"step": 3450
},
{
"epoch": 0.9055221146296781,
"grad_norm": 11.80216121673584,
"learning_rate": 1.3479400280141886e-07,
"logits/chosen": -1.2136515378952026,
"logits/rejected": -1.1657497882843018,
"logps/chosen": -525.5947265625,
"logps/rejected": -662.1110229492188,
"loss": 0.4657,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.7938077449798584,
"rewards/margins": 1.385801076889038,
"rewards/rejected": -4.1796088218688965,
"step": 3460
},
{
"epoch": 0.9081392305679141,
"grad_norm": 9.422633171081543,
"learning_rate": 1.2749288376442044e-07,
"logits/chosen": -1.3240694999694824,
"logits/rejected": -1.106979489326477,
"logps/chosen": -565.2491455078125,
"logps/rejected": -643.6472778320312,
"loss": 0.4631,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.6884024143218994,
"rewards/margins": 1.3586628437042236,
"rewards/rejected": -4.047064781188965,
"step": 3470
},
{
"epoch": 0.9107563465061502,
"grad_norm": 9.699939727783203,
"learning_rate": 1.203898683888713e-07,
"logits/chosen": -1.306654691696167,
"logits/rejected": -1.1677879095077515,
"logps/chosen": -532.5252685546875,
"logps/rejected": -648.9597778320312,
"loss": 0.5443,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.886373519897461,
"rewards/margins": 1.1954319477081299,
"rewards/rejected": -4.081805229187012,
"step": 3480
},
{
"epoch": 0.9133734624443863,
"grad_norm": 11.356287002563477,
"learning_rate": 1.1348554977451132e-07,
"logits/chosen": -1.3395811319351196,
"logits/rejected": -1.1923797130584717,
"logps/chosen": -551.8304443359375,
"logps/rejected": -638.4985961914062,
"loss": 0.5142,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.699251651763916,
"rewards/margins": 1.195854902267456,
"rewards/rejected": -3.895106792449951,
"step": 3490
},
{
"epoch": 0.9159905783826223,
"grad_norm": 10.900007247924805,
"learning_rate": 1.0678050442995802e-07,
"logits/chosen": -1.306223750114441,
"logits/rejected": -1.088254690170288,
"logps/chosen": -554.8663330078125,
"logps/rejected": -634.9073486328125,
"loss": 0.5272,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.7420237064361572,
"rewards/margins": 1.2511793375015259,
"rewards/rejected": -3.9932029247283936,
"step": 3500
},
{
"epoch": 0.9159905783826223,
"eval_logits/chosen": -1.160068154335022,
"eval_logits/rejected": -1.023296594619751,
"eval_logps/chosen": -536.0448608398438,
"eval_logps/rejected": -647.7730102539062,
"eval_loss": 0.4797233045101166,
"eval_rewards/accuracies": 0.746999979019165,
"eval_rewards/chosen": -2.7144289016723633,
"eval_rewards/margins": 1.3175978660583496,
"eval_rewards/rejected": -4.032026767730713,
"eval_runtime": 1597.7222,
"eval_samples_per_second": 1.252,
"eval_steps_per_second": 0.156,
"step": 3500
},
{
"epoch": 0.9186076943208584,
"grad_norm": 9.065542221069336,
"learning_rate": 1.0027529222456755e-07,
"logits/chosen": -1.2851347923278809,
"logits/rejected": -1.0965713262557983,
"logps/chosen": -514.8778076171875,
"logps/rejected": -634.3299560546875,
"loss": 0.4078,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.6071436405181885,
"rewards/margins": 1.371249794960022,
"rewards/rejected": -3.978393077850342,
"step": 3510
},
{
"epoch": 0.9212248102590945,
"grad_norm": 12.434161186218262,
"learning_rate": 9.397045634168766e-08,
"logits/chosen": -1.3080555200576782,
"logits/rejected": -1.222429633140564,
"logps/chosen": -531.6901245117188,
"logps/rejected": -675.6932373046875,
"loss": 0.4615,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.632573366165161,
"rewards/margins": 1.43732750415802,
"rewards/rejected": -4.069900989532471,
"step": 3520
},
{
"epoch": 0.9238419261973305,
"grad_norm": 16.84272003173828,
"learning_rate": 8.78665232332998e-08,
"logits/chosen": -1.243849515914917,
"logits/rejected": -1.1529252529144287,
"logps/chosen": -504.6124572753906,
"logps/rejected": -625.7620849609375,
"loss": 0.474,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.7395050525665283,
"rewards/margins": 1.2195281982421875,
"rewards/rejected": -3.959033489227295,
"step": 3530
},
{
"epoch": 0.9264590421355666,
"grad_norm": 8.225701332092285,
"learning_rate": 8.196400257606208e-08,
"logits/chosen": -1.348672866821289,
"logits/rejected": -1.1730903387069702,
"logps/chosen": -544.8364868164062,
"logps/rejected": -697.6771240234375,
"loss": 0.4144,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.6183691024780273,
"rewards/margins": 1.5808098316192627,
"rewards/rejected": -4.199179172515869,
"step": 3540
},
{
"epoch": 0.9290761580738026,
"grad_norm": 11.679130554199219,
"learning_rate": 7.626338722875076e-08,
"logits/chosen": -1.2883799076080322,
"logits/rejected": -1.2092903852462769,
"logps/chosen": -517.6767578125,
"logps/rejected": -647.9732666015625,
"loss": 0.4827,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.6364307403564453,
"rewards/margins": 1.2941230535507202,
"rewards/rejected": -3.930554151535034,
"step": 3550
},
{
"epoch": 0.9316932740120387,
"grad_norm": 5.509209632873535,
"learning_rate": 7.076515319110688e-08,
"logits/chosen": -1.283998727798462,
"logits/rejected": -1.1587374210357666,
"logps/chosen": -517.294677734375,
"logps/rejected": -615.6148071289062,
"loss": 0.5114,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.6106066703796387,
"rewards/margins": 1.3990845680236816,
"rewards/rejected": -4.00969123840332,
"step": 3560
},
{
"epoch": 0.9343103899502748,
"grad_norm": 7.372361660003662,
"learning_rate": 6.54697595640899e-08,
"logits/chosen": -1.3087977170944214,
"logits/rejected": -1.178120493888855,
"logps/chosen": -560.3231201171875,
"logps/rejected": -660.3911743164062,
"loss": 0.4774,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.6826512813568115,
"rewards/margins": 1.2511831521987915,
"rewards/rejected": -3.9338345527648926,
"step": 3570
},
{
"epoch": 0.9369275058885108,
"grad_norm": 9.634334564208984,
"learning_rate": 6.037764851154426e-08,
"logits/chosen": -1.3032505512237549,
"logits/rejected": -1.231890320777893,
"logps/chosen": -522.193603515625,
"logps/rejected": -652.8558349609375,
"loss": 0.4929,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.5825421810150146,
"rewards/margins": 1.2584049701690674,
"rewards/rejected": -3.840946912765503,
"step": 3580
},
{
"epoch": 0.9395446218267469,
"grad_norm": 7.305212497711182,
"learning_rate": 5.548924522327748e-08,
"logits/chosen": -1.2807940244674683,
"logits/rejected": -1.1494871377944946,
"logps/chosen": -517.2708740234375,
"logps/rejected": -632.7682495117188,
"loss": 0.4594,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.5563392639160156,
"rewards/margins": 1.3009912967681885,
"rewards/rejected": -3.857330799102783,
"step": 3590
},
{
"epoch": 0.942161737764983,
"grad_norm": 13.96353530883789,
"learning_rate": 5.0804957879556915e-08,
"logits/chosen": -1.198561429977417,
"logits/rejected": -1.0997329950332642,
"logps/chosen": -484.185546875,
"logps/rejected": -620.9610595703125,
"loss": 0.4441,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.587231397628784,
"rewards/margins": 1.3218923807144165,
"rewards/rejected": -3.909123659133911,
"step": 3600
},
{
"epoch": 0.942161737764983,
"eval_logits/chosen": -1.164100170135498,
"eval_logits/rejected": -1.0277760028839111,
"eval_logps/chosen": -529.1943969726562,
"eval_logps/rejected": -639.7042846679688,
"eval_loss": 0.4790266156196594,
"eval_rewards/accuracies": 0.746999979019165,
"eval_rewards/chosen": -2.6459240913391113,
"eval_rewards/margins": 1.3054152727127075,
"eval_rewards/rejected": -3.9513394832611084,
"eval_runtime": 1598.6642,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 3600
},
{
"epoch": 0.944778853703219,
"grad_norm": 9.906332969665527,
"learning_rate": 4.632517761702815e-08,
"logits/chosen": -1.2350178956985474,
"logits/rejected": -1.0819157361984253,
"logps/chosen": -498.882568359375,
"logps/rejected": -640.2724609375,
"loss": 0.4371,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.635540008544922,
"rewards/margins": 1.536821722984314,
"rewards/rejected": -4.172361850738525,
"step": 3610
},
{
"epoch": 0.9473959696414551,
"grad_norm": 12.568668365478516,
"learning_rate": 4.205027849605359e-08,
"logits/chosen": -1.2538177967071533,
"logits/rejected": -1.1405234336853027,
"logps/chosen": -525.0173950195312,
"logps/rejected": -613.74560546875,
"loss": 0.5461,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.771757125854492,
"rewards/margins": 1.1880303621292114,
"rewards/rejected": -3.959787368774414,
"step": 3620
},
{
"epoch": 0.9500130855796912,
"grad_norm": 8.876080513000488,
"learning_rate": 3.798061746947995e-08,
"logits/chosen": -1.373834252357483,
"logits/rejected": -1.228161334991455,
"logps/chosen": -527.4785766601562,
"logps/rejected": -619.109375,
"loss": 0.4818,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.6347904205322266,
"rewards/margins": 1.3071154356002808,
"rewards/rejected": -3.941905975341797,
"step": 3630
},
{
"epoch": 0.9526302015179272,
"grad_norm": 9.864373207092285,
"learning_rate": 3.411653435283158e-08,
"logits/chosen": -1.29331636428833,
"logits/rejected": -1.077043056488037,
"logps/chosen": -534.7817993164062,
"logps/rejected": -597.4425048828125,
"loss": 0.4952,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.6166446208953857,
"rewards/margins": 1.1832177639007568,
"rewards/rejected": -3.7998623847961426,
"step": 3640
},
{
"epoch": 0.9552473174561633,
"grad_norm": 12.852518081665039,
"learning_rate": 3.04583517959367e-08,
"logits/chosen": -1.339290976524353,
"logits/rejected": -1.1780240535736084,
"logps/chosen": -494.443603515625,
"logps/rejected": -590.2618408203125,
"loss": 0.4506,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.4245123863220215,
"rewards/margins": 1.2823518514633179,
"rewards/rejected": -3.70686411857605,
"step": 3650
},
{
"epoch": 0.9578644333943994,
"grad_norm": 9.657252311706543,
"learning_rate": 2.7006375255985984e-08,
"logits/chosen": -1.281280755996704,
"logits/rejected": -1.2413251399993896,
"logps/chosen": -539.2777099609375,
"logps/rejected": -645.9277954101562,
"loss": 0.5566,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.7454452514648438,
"rewards/margins": 1.0612398386001587,
"rewards/rejected": -3.8066844940185547,
"step": 3660
},
{
"epoch": 0.9604815493326354,
"grad_norm": 12.267367362976074,
"learning_rate": 2.3760892972027328e-08,
"logits/chosen": -1.3983352184295654,
"logits/rejected": -1.225462794303894,
"logps/chosen": -544.6994018554688,
"logps/rejected": -638.8689575195312,
"loss": 0.5249,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.7232279777526855,
"rewards/margins": 1.2808904647827148,
"rewards/rejected": -4.0041184425354,
"step": 3670
},
{
"epoch": 0.9630986652708715,
"grad_norm": 13.94206714630127,
"learning_rate": 2.072217594089765e-08,
"logits/chosen": -1.2447559833526611,
"logits/rejected": -1.2226978540420532,
"logps/chosen": -527.1612548828125,
"logps/rejected": -661.6229248046875,
"loss": 0.3918,
"rewards/accuracies": 0.84375,
"rewards/chosen": -2.6566879749298096,
"rewards/margins": 1.467563271522522,
"rewards/rejected": -4.124251365661621,
"step": 3680
},
{
"epoch": 0.9657157812091076,
"grad_norm": 8.289405822753906,
"learning_rate": 1.789047789459375e-08,
"logits/chosen": -1.3621820211410522,
"logits/rejected": -1.1523287296295166,
"logps/chosen": -576.5960693359375,
"logps/rejected": -663.8076782226562,
"loss": 0.5052,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.597318172454834,
"rewards/margins": 1.3620169162750244,
"rewards/rejected": -3.9593348503112793,
"step": 3690
},
{
"epoch": 0.9683328971473436,
"grad_norm": 7.745994567871094,
"learning_rate": 1.5266035279088708e-08,
"logits/chosen": -1.1971657276153564,
"logits/rejected": -1.0677430629730225,
"logps/chosen": -573.20361328125,
"logps/rejected": -680.00244140625,
"loss": 0.4823,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.7669663429260254,
"rewards/margins": 1.330165147781372,
"rewards/rejected": -4.097131729125977,
"step": 3700
},
{
"epoch": 0.9683328971473436,
"eval_logits/chosen": -1.1687482595443726,
"eval_logits/rejected": -1.0329276323318481,
"eval_logps/chosen": -527.3952026367188,
"eval_logps/rejected": -637.1880493164062,
"eval_loss": 0.47885680198669434,
"eval_rewards/accuracies": 0.7480000257492065,
"eval_rewards/chosen": -2.627932548522949,
"eval_rewards/margins": 1.298244595527649,
"eval_rewards/rejected": -3.9261767864227295,
"eval_runtime": 1598.696,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 3700
},
{
"epoch": 0.9709500130855797,
"grad_norm": 18.376056671142578,
"learning_rate": 1.2849067234584623e-08,
"logits/chosen": -1.182948112487793,
"logits/rejected": -1.10740065574646,
"logps/chosen": -496.62152099609375,
"logps/rejected": -624.2833862304688,
"loss": 0.4808,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.603543281555176,
"rewards/margins": 1.3529905080795288,
"rewards/rejected": -3.956533908843994,
"step": 3710
},
{
"epoch": 0.9735671290238157,
"grad_norm": 9.365938186645508,
"learning_rate": 1.0639775577218625e-08,
"logits/chosen": -1.1770192384719849,
"logits/rejected": -1.0035854578018188,
"logps/chosen": -516.5242309570312,
"logps/rejected": -607.8884887695312,
"loss": 0.5282,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.6737780570983887,
"rewards/margins": 1.2736929655075073,
"rewards/rejected": -3.9474711418151855,
"step": 3720
},
{
"epoch": 0.9761842449620518,
"grad_norm": 11.24964427947998,
"learning_rate": 8.638344782207486e-09,
"logits/chosen": -1.1972987651824951,
"logits/rejected": -1.0941554307937622,
"logps/chosen": -499.6304626464844,
"logps/rejected": -600.4740600585938,
"loss": 0.4853,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.5446014404296875,
"rewards/margins": 1.2359497547149658,
"rewards/rejected": -3.7805511951446533,
"step": 3730
},
{
"epoch": 0.9788013609002879,
"grad_norm": 10.543977737426758,
"learning_rate": 6.84494196844715e-09,
"logits/chosen": -1.2653145790100098,
"logits/rejected": -1.1376771926879883,
"logps/chosen": -532.8594970703125,
"logps/rejected": -669.3706665039062,
"loss": 0.4524,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.608578681945801,
"rewards/margins": 1.4806853532791138,
"rewards/rejected": -4.089264392852783,
"step": 3740
},
{
"epoch": 0.9814184768385239,
"grad_norm": 8.994680404663086,
"learning_rate": 5.259716884556121e-09,
"logits/chosen": -1.3201556205749512,
"logits/rejected": -1.1730735301971436,
"logps/chosen": -524.9544677734375,
"logps/rejected": -640.7349853515625,
"loss": 0.451,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.6050658226013184,
"rewards/margins": 1.3061447143554688,
"rewards/rejected": -3.911210536956787,
"step": 3750
},
{
"epoch": 0.98403559277676,
"grad_norm": 9.462129592895508,
"learning_rate": 3.882801896372967e-09,
"logits/chosen": -1.3206380605697632,
"logits/rejected": -1.223382830619812,
"logps/chosen": -523.351318359375,
"logps/rejected": -627.8754272460938,
"loss": 0.4966,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.58086895942688,
"rewards/margins": 1.3636963367462158,
"rewards/rejected": -3.9445652961730957,
"step": 3760
},
{
"epoch": 0.9866527087149961,
"grad_norm": 11.428803443908691,
"learning_rate": 2.7143119759026614e-09,
"logits/chosen": -1.3305742740631104,
"logits/rejected": -1.1541904211044312,
"logps/chosen": -536.9759521484375,
"logps/rejected": -636.7061157226562,
"loss": 0.4246,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.528337001800537,
"rewards/margins": 1.2519800662994385,
"rewards/rejected": -3.7803173065185547,
"step": 3770
},
{
"epoch": 0.9892698246532321,
"grad_norm": 10.81966781616211,
"learning_rate": 1.754344691717591e-09,
"logits/chosen": -1.2256147861480713,
"logits/rejected": -1.1796106100082397,
"logps/chosen": -514.3230590820312,
"logps/rejected": -642.4959716796875,
"loss": 0.5022,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.645564556121826,
"rewards/margins": 1.0614073276519775,
"rewards/rejected": -3.7069716453552246,
"step": 3780
},
{
"epoch": 0.9918869405914682,
"grad_norm": 16.201265335083008,
"learning_rate": 1.0029802008096335e-09,
"logits/chosen": -1.2533369064331055,
"logits/rejected": -1.0869061946868896,
"logps/chosen": -542.9913330078125,
"logps/rejected": -650.4954833984375,
"loss": 0.4961,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.687774181365967,
"rewards/margins": 1.3033173084259033,
"rewards/rejected": -3.99109148979187,
"step": 3790
},
{
"epoch": 0.9945040565297043,
"grad_norm": 7.363870143890381,
"learning_rate": 4.602812418974534e-10,
"logits/chosen": -1.3605427742004395,
"logits/rejected": -1.2153841257095337,
"logps/chosen": -546.0870971679688,
"logps/rejected": -652.19921875,
"loss": 0.4996,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.6433169841766357,
"rewards/margins": 1.3066623210906982,
"rewards/rejected": -3.949979305267334,
"step": 3800
},
{
"epoch": 0.9945040565297043,
"eval_logits/chosen": -1.1657898426055908,
"eval_logits/rejected": -1.0296279191970825,
"eval_logps/chosen": -526.756103515625,
"eval_logps/rejected": -636.4028930664062,
"eval_loss": 0.4788345396518707,
"eval_rewards/accuracies": 0.7475000023841858,
"eval_rewards/chosen": -2.62154221534729,
"eval_rewards/margins": 1.2967824935913086,
"eval_rewards/rejected": -3.9183249473571777,
"eval_runtime": 1598.3049,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 3800
},
{
"epoch": 0.9971211724679403,
"grad_norm": 12.966708183288574,
"learning_rate": 1.2629313018819312e-10,
"logits/chosen": -1.2771844863891602,
"logits/rejected": -1.147637128829956,
"logps/chosen": -510.83001708984375,
"logps/rejected": -608.63037109375,
"loss": 0.503,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.5848309993743896,
"rewards/margins": 1.134263277053833,
"rewards/rejected": -3.7190945148468018,
"step": 3810
},
{
"epoch": 0.9997382884061764,
"grad_norm": 16.982664108276367,
"learning_rate": 1.0437535929996855e-12,
"logits/chosen": -1.262603998184204,
"logits/rejected": -1.0868072509765625,
"logps/chosen": -551.4014892578125,
"logps/rejected": -662.4216918945312,
"loss": 0.4568,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.6468563079833984,
"rewards/margins": 1.5578194856643677,
"rewards/rejected": -4.204675674438477,
"step": 3820
},
{
"epoch": 1.0,
"step": 3821,
"total_flos": 0.0,
"train_loss": 0.517807064771465,
"train_runtime": 164396.369,
"train_samples_per_second": 0.372,
"train_steps_per_second": 0.023
}
],
"logging_steps": 10,
"max_steps": 3821,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}