{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0283877878950187, "eval_steps": 64, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021424745581146223, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 12.50527462559239, "learning_rate": 0.0, "logits/chosen": 1.0446698665618896, "logits/rejected": 0.9781918525695801, "logps/accuracies": 0.25, "logps/chosen": -270.4280700683594, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -270.4280700683594, "logps/ref_rejected": -259.14373779296875, "logps/rejected": -259.14373779296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/grad_term": 0.02500000037252903, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004284949116229245, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 12.856352015281924, "learning_rate": 1.0679540942081149e-07, "logits/chosen": 0.4414063096046448, "logits/rejected": 0.32948625087738037, "logps/accuracies": 0.5, "logps/chosen": -318.21771240234375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -317.62615966796875, "logps/ref_rejected": -221.48974609375, "logps/rejected": -221.3629608154297, "loss": 0.69, "rewards/accuracies": 0.25, "rewards/chosen": -0.02957625314593315, "rewards/grad_term": 0.0254487507045269, "rewards/margins": -0.03591585159301758, "rewards/rejected": 0.006339598447084427, "step": 2 }, { "epoch": 0.006427423674343867, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 13.280048310716381, "learning_rate": 1.6926671918114913e-07, "logits/chosen": 1.0256762504577637, "logits/rejected": 0.8810745477676392, "logps/accuracies": 0.5, "logps/chosen": -355.5387268066406, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -354.57049560546875, "logps/ref_rejected": -364.0948486328125, "logps/rejected": -363.83416748046875, "loss": 0.6892, "rewards/accuracies": 0.0, "rewards/chosen": -0.0484108030796051, "rewards/grad_term": 0.0257670097053051, "rewards/margins": -0.061444856226444244, "rewards/rejected": 0.013034057803452015, "step": 3 }, { "epoch": 0.00856989823245849, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 12.710494813385088, "learning_rate": 2.1359081884162297e-07, "logits/chosen": 1.1384323835372925, "logits/rejected": 1.0655404329299927, "logps/accuracies": 0.5, "logps/chosen": -442.36578369140625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -442.594970703125, "logps/ref_rejected": -345.18572998046875, "logps/rejected": -344.8496398925781, "loss": 0.6921, "rewards/accuracies": 0.75, "rewards/chosen": 0.0114593505859375, "rewards/grad_term": 0.0250665545463562, "rewards/margins": -0.005344009958207607, "rewards/rejected": 0.016803361475467682, "step": 4 }, { "epoch": 0.010712372790573112, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 14.937463503106938, "learning_rate": 2.479712615391807e-07, "logits/chosen": 0.5404180288314819, "logits/rejected": 0.45622000098228455, "logps/accuracies": 0.0, "logps/chosen": -413.39813232421875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -413.3487854003906, "logps/ref_rejected": -304.2044982910156, "logps/rejected": -304.66510009765625, "loss": 0.691, "rewards/accuracies": 0.75, "rewards/chosen": -0.00246772775426507, "rewards/grad_term": 0.024743197485804558, "rewards/margins": 0.020562361925840378, "rewards/rejected": -0.02303009107708931, "step": 5 }, { "epoch": 0.012854847348687734, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 12.517831245113905, "learning_rate": 2.7606212860196063e-07, "logits/chosen": 1.1012723445892334, "logits/rejected": 0.7194727659225464, "logps/accuracies": 0.5, "logps/chosen": -258.80072021484375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -258.87176513671875, "logps/ref_rejected": -247.64498901367188, "logps/rejected": -247.63116455078125, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.0035531027242541313, "rewards/grad_term": 0.024964267387986183, "rewards/margins": 0.002861691638827324, "rewards/rejected": 0.0006914124824106693, "step": 6 }, { "epoch": 0.014997321906802356, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 12.204086479352032, "learning_rate": 2.9981261829067217e-07, "logits/chosen": 1.0940355062484741, "logits/rejected": 0.9565569162368774, "logps/accuracies": 0.25, "logps/chosen": -237.1595916748047, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -236.5460662841797, "logps/ref_rejected": -215.79209899902344, "logps/rejected": -215.83944702148438, "loss": 0.6913, "rewards/accuracies": 0.25, "rewards/chosen": -0.030676748603582382, "rewards/grad_term": 0.025353606790304184, "rewards/margins": -0.02830987237393856, "rewards/rejected": -0.0023668762296438217, "step": 7 }, { "epoch": 0.01713979646491698, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 12.475151458421857, "learning_rate": 3.2038622826243447e-07, "logits/chosen": 0.967343270778656, "logits/rejected": 0.9412274360656738, "logps/accuracies": 0.5, "logps/chosen": -279.015380859375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -279.0576171875, "logps/ref_rejected": -273.6654968261719, "logps/rejected": -273.8743591308594, "loss": 0.6941, "rewards/accuracies": 0.75, "rewards/chosen": 0.0021114349365234375, "rewards/grad_term": 0.02484307810664177, "rewards/margins": 0.01255502738058567, "rewards/rejected": -0.010443592444062233, "step": 8 }, { "epoch": 0.0192822710230316, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 12.580237634234056, "learning_rate": 3.3853343836229826e-07, "logits/chosen": 0.9568102359771729, "logits/rejected": 0.9983876943588257, "logps/accuracies": 0.5, "logps/chosen": -331.88922119140625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -332.0152587890625, "logps/ref_rejected": -332.083251953125, "logps/rejected": -331.71844482421875, "loss": 0.6903, "rewards/accuracies": 0.25, "rewards/chosen": 0.00630226219072938, "rewards/grad_term": 0.02514958195388317, "rewards/margins": -0.011937713250517845, "rewards/rejected": 0.018239974975585938, "step": 9 }, { "epoch": 0.021424745581146223, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 13.151098903300593, "learning_rate": 3.5476667095999224e-07, "logits/chosen": 1.000882863998413, "logits/rejected": 0.9467081427574158, "logps/accuracies": 0.5, "logps/chosen": -320.8128662109375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -320.7702941894531, "logps/ref_rejected": -298.6964111328125, "logps/rejected": -298.3901672363281, "loss": 0.6846, "rewards/accuracies": 0.75, "rewards/chosen": -0.0021286013070493937, "rewards/grad_term": 0.025217382237315178, "rewards/margins": -0.017440060153603554, "rewards/rejected": 0.01531145628541708, "step": 10 }, { "epoch": 0.023567220139260846, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 12.233476785972572, "learning_rate": 3.6945141607567076e-07, "logits/chosen": 1.1362941265106201, "logits/rejected": 1.0800057649612427, "logps/accuracies": 0.5, "logps/chosen": -398.43902587890625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -399.17889404296875, "logps/ref_rejected": -400.2832946777344, "logps/rejected": -400.0302429199219, "loss": 0.6858, "rewards/accuracies": 0.75, "rewards/chosen": 0.036993030458688736, "rewards/grad_term": 0.024695834144949913, "rewards/margins": 0.024341586977243423, "rewards/rejected": 0.012651442550122738, "step": 11 }, { "epoch": 0.025709694697375468, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 12.317733670973258, "learning_rate": 3.8285753802277215e-07, "logits/chosen": 1.1415050029754639, "logits/rejected": 0.6014249920845032, "logps/accuracies": 0.0, "logps/chosen": -187.37680053710938, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -187.9676513671875, "logps/ref_rejected": -126.48046875, "logps/rejected": -126.51386260986328, "loss": 0.685, "rewards/accuracies": 0.75, "rewards/chosen": 0.0295425895601511, "rewards/grad_term": 0.02461005374789238, "rewards/margins": 0.03121213987469673, "rewards/rejected": -0.0016695503145456314, "step": 12 }, { "epoch": 0.02785216925549009, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 13.848164526581762, "learning_rate": 3.9518997473591026e-07, "logits/chosen": 1.0744524002075195, "logits/rejected": 0.8894139528274536, "logps/accuracies": 0.0, "logps/chosen": -277.2958984375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -278.2149658203125, "logps/ref_rejected": -189.09963989257812, "logps/rejected": -189.11630249023438, "loss": 0.688, "rewards/accuracies": 0.75, "rewards/chosen": 0.04595203697681427, "rewards/grad_term": 0.02441561222076416, "rewards/margins": 0.04678481072187424, "rewards/rejected": -0.0008327784016728401, "step": 13 }, { "epoch": 0.029994643813604713, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 12.272371420817562, "learning_rate": 4.066080277114836e-07, "logits/chosen": 1.0568749904632568, "logits/rejected": 0.8716171979904175, "logps/accuracies": 0.0, "logps/chosen": -219.07720947265625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -219.25765991210938, "logps/ref_rejected": -129.22840881347656, "logps/rejected": -128.9102020263672, "loss": 0.686, "rewards/accuracies": 0.25, "rewards/chosen": 0.009022902697324753, "rewards/grad_term": 0.025086142122745514, "rewards/margins": -0.006888152565807104, "rewards/rejected": 0.015911055728793144, "step": 14 }, { "epoch": 0.032137118371719335, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 12.428694983873564, "learning_rate": 4.1723798072032976e-07, "logits/chosen": 1.037872314453125, "logits/rejected": 0.9692336320877075, "logps/accuracies": 0.25, "logps/chosen": -381.4244384765625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -382.68658447265625, "logps/ref_rejected": -322.65679931640625, "logps/rejected": -321.3078308105469, "loss": 0.6845, "rewards/accuracies": 0.5, "rewards/chosen": 0.06310494244098663, "rewards/grad_term": 0.02505427412688732, "rewards/margins": -0.004344702698290348, "rewards/rejected": 0.06744963675737381, "step": 15 }, { "epoch": 0.03427959292983396, "flips/correct->correct": 1.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 12.620364972338749, "learning_rate": 4.2718163768324594e-07, "logits/chosen": 0.9468654990196228, "logits/rejected": 1.170510172843933, "logps/accuracies": 1.0, "logps/chosen": -173.85870361328125, "logps/ref_accuracies": 1.0, "logps/ref_chosen": -174.64218139648438, "logps/ref_rejected": -222.02438354492188, "logps/rejected": -222.00958251953125, "loss": 0.6788, "rewards/accuracies": 0.75, "rewards/chosen": 0.03917388990521431, "rewards/grad_term": 0.024519937112927437, "rewards/margins": 0.03843364864587784, "rewards/rejected": 0.0007402412593364716, "step": 16 }, { "epoch": 0.03642206748794858, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.697678503172199, "learning_rate": 4.3652226762368345e-07, "logits/chosen": 1.0040562152862549, "logits/rejected": 0.9608211517333984, "logps/accuracies": 0.5, "logps/chosen": -253.23019409179688, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -254.5650177001953, "logps/ref_rejected": -273.43707275390625, "logps/rejected": -273.1368713378906, "loss": 0.6744, "rewards/accuracies": 0.75, "rewards/chosen": 0.06674174964427948, "rewards/grad_term": 0.024354225024580956, "rewards/margins": 0.051732055842876434, "rewards/rejected": 0.015009691938757896, "step": 17 }, { "epoch": 0.0385645420460632, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.78637513369074, "learning_rate": 4.4532884778310973e-07, "logits/chosen": 1.0503087043762207, "logits/rejected": 0.9166826009750366, "logps/accuracies": 0.5, "logps/chosen": -244.81471252441406, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -244.83123779296875, "logps/ref_rejected": -240.30084228515625, "logps/rejected": -239.78488159179688, "loss": 0.6772, "rewards/accuracies": 0.25, "rewards/chosen": 0.0008262638584710658, "rewards/grad_term": 0.0253120306879282, "rewards/margins": -0.02497131936252117, "rewards/rejected": 0.025797583162784576, "step": 18 }, { "epoch": 0.040707016604177824, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 11.706003949713145, "learning_rate": 4.536591579881374e-07, "logits/chosen": 1.015830159187317, "logits/rejected": 0.9545015692710876, "logps/accuracies": 0.25, "logps/chosen": -264.29638671875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -266.071044921875, "logps/ref_rejected": -236.97666931152344, "logps/rejected": -236.77438354492188, "loss": 0.6778, "rewards/accuracies": 0.75, "rewards/chosen": 0.08873386681079865, "rewards/grad_term": 0.02401968464255333, "rewards/margins": 0.07861967384815216, "rewards/rejected": 0.01011419203132391, "step": 19 }, { "epoch": 0.04284949116229245, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.815048094751976, "learning_rate": 4.615620803808037e-07, "logits/chosen": 0.896416187286377, "logits/rejected": 1.0262924432754517, "logps/accuracies": 0.5, "logps/chosen": -294.67327880859375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -296.34197998046875, "logps/ref_rejected": -321.8549499511719, "logps/rejected": -321.5665283203125, "loss": 0.6743, "rewards/accuracies": 1.0, "rewards/chosen": 0.08343505859375, "rewards/grad_term": 0.02414114400744438, "rewards/margins": 0.06901436299085617, "rewards/rejected": 0.014420699328184128, "step": 20 }, { "epoch": 0.04499196572040707, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 11.637316673579596, "learning_rate": 4.6907933747182127e-07, "logits/chosen": 0.8530393242835999, "logits/rejected": 0.6534068584442139, "logps/accuracies": 0.25, "logps/chosen": -276.7626647949219, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -277.6459655761719, "logps/ref_rejected": -269.14227294921875, "logps/rejected": -270.58624267578125, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 0.04416485130786896, "rewards/grad_term": 0.02355196699500084, "rewards/margins": 0.11636309325695038, "rewards/rejected": -0.07219824939966202, "step": 21 }, { "epoch": 0.04713444027852169, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.876590663836538, "learning_rate": 4.762468254964823e-07, "logits/chosen": 1.009377121925354, "logits/rejected": 0.9447416067123413, "logps/accuracies": 0.5, "logps/chosen": -317.7878723144531, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -319.1816711425781, "logps/ref_rejected": -311.2941589355469, "logps/rejected": -311.8822021484375, "loss": 0.671, "rewards/accuracies": 1.0, "rewards/chosen": 0.06969013810157776, "rewards/grad_term": 0.023765018209815025, "rewards/margins": 0.09909267723560333, "rewards/rejected": -0.029402542859315872, "step": 22 }, { "epoch": 0.049276914836636314, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 12.586762655265133, "learning_rate": 4.830956511375156e-07, "logits/chosen": 1.203920841217041, "logits/rejected": 1.1563231945037842, "logps/accuracies": 0.0, "logps/chosen": -436.044921875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -438.2389831542969, "logps/ref_rejected": -385.3761901855469, "logps/rejected": -385.66180419921875, "loss": 0.6674, "rewards/accuracies": 0.5, "rewards/chosen": 0.10970421135425568, "rewards/grad_term": 0.0234676580876112, "rewards/margins": 0.12398529052734375, "rewards/rejected": -0.014281081967055798, "step": 23 }, { "epoch": 0.051419389394750936, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 12.26080518238754, "learning_rate": 4.896529474435837e-07, "logits/chosen": 0.9966791868209839, "logits/rejected": 0.9108967185020447, "logps/accuracies": 0.25, "logps/chosen": -307.80572509765625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -310.31866455078125, "logps/ref_rejected": -252.79197692871094, "logps/rejected": -252.8419189453125, "loss": 0.6669, "rewards/accuracies": 0.75, "rewards/chosen": 0.12564696371555328, "rewards/grad_term": 0.023410532623529434, "rewards/margins": 0.12814360857009888, "rewards/rejected": -0.002496624831110239, "step": 24 }, { "epoch": 0.05356186395286556, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.366243366820711, "learning_rate": 4.959425230783614e-07, "logits/chosen": 0.9740027785301208, "logits/rejected": 0.8190696239471436, "logps/accuracies": 0.5, "logps/chosen": -320.2606506347656, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -321.4267272949219, "logps/ref_rejected": -328.9568176269531, "logps/rejected": -328.89703369140625, "loss": 0.6599, "rewards/accuracies": 0.75, "rewards/chosen": 0.05830345302820206, "rewards/grad_term": 0.024309232831001282, "rewards/margins": 0.055313680320978165, "rewards/rejected": 0.0029897689819335938, "step": 25 }, { "epoch": 0.05570433851098018, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 11.08055129360116, "learning_rate": 5.019853841567218e-07, "logits/chosen": 1.1111705303192139, "logits/rejected": 0.6438971161842346, "logps/accuracies": 0.25, "logps/chosen": -187.502197265625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -188.0975341796875, "logps/ref_rejected": -102.05082702636719, "logps/rejected": -102.74217987060547, "loss": 0.6499, "rewards/accuracies": 0.75, "rewards/chosen": 0.029765892773866653, "rewards/grad_term": 0.024199258536100388, "rewards/margins": 0.06433363258838654, "rewards/rejected": -0.034567736089229584, "step": 26 }, { "epoch": 0.0578468130690948, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.88218195606169, "learning_rate": 5.078001575434473e-07, "logits/chosen": 0.791816771030426, "logits/rejected": 0.884813666343689, "logps/accuracies": 0.5, "logps/chosen": -189.6773223876953, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -190.02127075195312, "logps/ref_rejected": -203.477783203125, "logps/rejected": -203.25155639648438, "loss": 0.6377, "rewards/accuracies": 0.75, "rewards/chosen": 0.017197083681821823, "rewards/grad_term": 0.024926653131842613, "rewards/margins": 0.00588593352586031, "rewards/rejected": 0.011311152018606663, "step": 27 }, { "epoch": 0.059989287627209426, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 11.239511800333705, "learning_rate": 5.134034371322951e-07, "logits/chosen": 1.134675145149231, "logits/rejected": 0.9631079435348511, "logps/accuracies": 0.25, "logps/chosen": -266.0494079589844, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -267.859619140625, "logps/ref_rejected": -242.5032958984375, "logps/rejected": -243.47943115234375, "loss": 0.636, "rewards/accuracies": 0.75, "rewards/chosen": 0.09051056951284409, "rewards/grad_term": 0.023272007703781128, "rewards/margins": 0.13931767642498016, "rewards/rejected": -0.04880712181329727, "step": 28 }, { "epoch": 0.06213176218532405, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 11.25145831338842, "learning_rate": 5.188100693331704e-07, "logits/chosen": 1.1358017921447754, "logits/rejected": 0.5541727542877197, "logps/accuracies": 0.0, "logps/chosen": -365.2521057128906, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -368.4560546875, "logps/ref_rejected": -241.02734375, "logps/rejected": -240.83399963378906, "loss": 0.639, "rewards/accuracies": 1.0, "rewards/chosen": 0.1601976454257965, "rewards/grad_term": 0.0231227595359087, "rewards/margins": 0.1505315899848938, "rewards/rejected": 0.009666061028838158, "step": 29 }, { "epoch": 0.06427423674343867, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 10.7696981919874, "learning_rate": 5.240333901411414e-07, "logits/chosen": 1.0822885036468506, "logits/rejected": 0.7618290185928345, "logps/accuracies": 0.0, "logps/chosen": -302.0892028808594, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -303.5124816894531, "logps/ref_rejected": -231.44943237304688, "logps/rejected": -231.66343688964844, "loss": 0.6391, "rewards/accuracies": 0.75, "rewards/chosen": 0.07116356492042542, "rewards/grad_term": 0.023980939760804176, "rewards/margins": 0.08186331391334534, "rewards/rejected": -0.010699748061597347, "step": 30 }, { "epoch": 0.0664167113015533, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 10.976869632876706, "learning_rate": 5.2908542331884e-07, "logits/chosen": 1.2087453603744507, "logits/rejected": 0.821852445602417, "logps/accuracies": 0.0, "logps/chosen": -408.1747741699219, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -410.11907958984375, "logps/ref_rejected": -321.87982177734375, "logps/rejected": -322.6016540527344, "loss": 0.6316, "rewards/accuracies": 1.0, "rewards/chosen": 0.09721412509679794, "rewards/grad_term": 0.023339958861470222, "rewards/margins": 0.1333070695400238, "rewards/rejected": -0.03609294816851616, "step": 31 }, { "epoch": 0.06855918585966791, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 10.55127291702564, "learning_rate": 5.339770471040575e-07, "logits/chosen": 1.0062335729599, "logits/rejected": 0.8178822994232178, "logps/accuracies": 0.25, "logps/chosen": -251.82504272460938, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -256.1924743652344, "logps/ref_rejected": -205.9676971435547, "logps/rejected": -208.8406524658203, "loss": 0.6257, "rewards/accuracies": 0.75, "rewards/chosen": 0.21837130188941956, "rewards/grad_term": 0.020633019506931305, "rewards/margins": 0.36201906204223633, "rewards/rejected": -0.14364777505397797, "step": 32 }, { "epoch": 0.07070166041778254, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.480165494817408, "learning_rate": 5.387181352568199e-07, "logits/chosen": 0.5726549625396729, "logits/rejected": 0.4411008358001709, "logps/accuracies": 0.5, "logps/chosen": -123.8817138671875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -125.0975570678711, "logps/ref_rejected": -92.3137435913086, "logps/rejected": -92.87496948242188, "loss": 0.6302, "rewards/accuracies": 0.75, "rewards/chosen": 0.06079201400279999, "rewards/grad_term": 0.02389412932097912, "rewards/margins": 0.08885356783866882, "rewards/rejected": -0.028061550110578537, "step": 33 }, { "epoch": 0.07284413497589716, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.769954405702693, "learning_rate": 5.43317677044495e-07, "logits/chosen": 0.8887495994567871, "logits/rejected": 0.7524275779724121, "logps/accuracies": 0.5, "logps/chosen": -233.1437530517578, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -235.97918701171875, "logps/ref_rejected": -232.49705505371094, "logps/rejected": -233.62205505371094, "loss": 0.6253, "rewards/accuracies": 0.75, "rewards/chosen": 0.1417713165283203, "rewards/grad_term": 0.022557333111763, "rewards/margins": 0.1980208307504654, "rewards/rejected": -0.056249529123306274, "step": 34 }, { "epoch": 0.07498660953401179, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.153566860628073, "learning_rate": 5.477838798298528e-07, "logits/chosen": 0.9315862655639648, "logits/rejected": 1.0292893648147583, "logps/accuracies": 0.5, "logps/chosen": -176.9609375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -178.0975799560547, "logps/ref_rejected": -238.39404296875, "logps/rejected": -238.72592163085938, "loss": 0.6173, "rewards/accuracies": 0.75, "rewards/chosen": 0.05683126673102379, "rewards/grad_term": 0.02408386766910553, "rewards/margins": 0.07342477142810822, "rewards/rejected": -0.016593504697084427, "step": 35 }, { "epoch": 0.0771290840921264, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.29602350540091, "learning_rate": 5.521242572039213e-07, "logits/chosen": 1.1964863538742065, "logits/rejected": 1.0984711647033691, "logps/accuracies": 0.5, "logps/chosen": -322.5604248046875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -322.1260986328125, "logps/ref_rejected": -287.303466796875, "logps/rejected": -293.40313720703125, "loss": 0.6001, "rewards/accuracies": 1.0, "rewards/chosen": -0.02171630784869194, "rewards/grad_term": 0.021510563790798187, "rewards/margins": 0.28326815366744995, "rewards/rejected": -0.3049844801425934, "step": 36 }, { "epoch": 0.07927155865024103, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.139112324828863, "learning_rate": 5.563457050409681e-07, "logits/chosen": 1.173073410987854, "logits/rejected": 1.0843687057495117, "logps/accuracies": 0.75, "logps/chosen": -273.99566650390625, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -275.65478515625, "logps/ref_rejected": -284.9312438964844, "logps/rejected": -284.83282470703125, "loss": 0.6029, "rewards/accuracies": 0.75, "rewards/chosen": 0.08295460045337677, "rewards/grad_term": 0.024028297513723373, "rewards/margins": 0.07803288102149963, "rewards/rejected": 0.0049217212945222855, "step": 37 }, { "epoch": 0.08141403320835565, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 11.36236689748956, "learning_rate": 5.604545674089489e-07, "logits/chosen": 1.127380609512329, "logits/rejected": 0.8692578077316284, "logps/accuracies": 0.25, "logps/chosen": -628.7327270507812, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -633.2546997070312, "logps/ref_rejected": -454.33013916015625, "logps/rejected": -454.3887939453125, "loss": 0.6011, "rewards/accuracies": 0.5, "rewards/chosen": 0.22609534859657288, "rewards/grad_term": 0.022296732291579247, "rewards/margins": 0.22902806103229523, "rewards/rejected": -0.0029327282682061195, "step": 38 }, { "epoch": 0.08355650776647028, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 10.40205719855862, "learning_rate": 5.644566939170593e-07, "logits/chosen": 1.1477112770080566, "logits/rejected": 0.7197964191436768, "logps/accuracies": 0.0, "logps/chosen": -342.099609375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -345.7111511230469, "logps/ref_rejected": -252.51214599609375, "logps/rejected": -257.9091796875, "loss": 0.5936, "rewards/accuracies": 1.0, "rewards/chosen": 0.18057651817798615, "rewards/grad_term": 0.01955207623541355, "rewards/margins": 0.45042720437049866, "rewards/rejected": -0.2698506712913513, "step": 39 }, { "epoch": 0.0856989823245849, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 10.52698957163884, "learning_rate": 5.683574898016152e-07, "logits/chosen": 1.155718207359314, "logits/rejected": 0.9540653228759766, "logps/accuracies": 0.25, "logps/chosen": -353.8412780761719, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -355.0787353515625, "logps/ref_rejected": -313.3082580566406, "logps/rejected": -315.2515869140625, "loss": 0.5912, "rewards/accuracies": 0.75, "rewards/chosen": 0.0618743896484375, "rewards/grad_term": 0.023020360618829727, "rewards/margins": 0.15904179215431213, "rewards/rejected": -0.09716740250587463, "step": 40 }, { "epoch": 0.08784145688269952, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 14.94332500203197, "learning_rate": 5.721619598264776e-07, "logits/chosen": 1.1048097610473633, "logits/rejected": 0.8005577325820923, "logps/accuracies": 0.0, "logps/chosen": -317.59912109375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -313.39208984375, "logps/ref_rejected": -253.1265411376953, "logps/rejected": -262.5826416015625, "loss": 0.5854, "rewards/accuracies": 0.75, "rewards/chosen": -0.2103523313999176, "rewards/grad_term": 0.021769195795059204, "rewards/margins": 0.2624519467353821, "rewards/rejected": -0.4728042781352997, "step": 41 }, { "epoch": 0.08998393144081414, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.074479922271454, "learning_rate": 5.758747468926328e-07, "logits/chosen": 0.8858407735824585, "logits/rejected": 0.7222996354103088, "logps/accuracies": 0.5, "logps/chosen": -298.509521484375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -300.38153076171875, "logps/ref_rejected": -263.8077087402344, "logps/rejected": -261.6027526855469, "loss": 0.5645, "rewards/accuracies": 0.5, "rewards/chosen": 0.09360065311193466, "rewards/grad_term": 0.025205716490745544, "rewards/margins": -0.01664828509092331, "rewards/rejected": 0.11024895310401917, "step": 42 }, { "epoch": 0.09212640599892877, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 10.322575540859543, "learning_rate": 5.795001661041298e-07, "logits/chosen": 1.0528746843338013, "logits/rejected": 0.8013145923614502, "logps/accuracies": 0.25, "logps/chosen": -357.80035400390625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -354.52734375, "logps/ref_rejected": -306.75, "logps/rejected": -324.1343994140625, "loss": 0.5605, "rewards/accuracies": 1.0, "rewards/chosen": -0.16364938020706177, "rewards/grad_term": 0.016722146421670914, "rewards/margins": 0.7055709958076477, "rewards/rejected": -0.8692203760147095, "step": 43 }, { "epoch": 0.09426888055704338, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.88902402136389, "learning_rate": 5.830422349172938e-07, "logits/chosen": 1.1213502883911133, "logits/rejected": 0.7059850692749023, "logps/accuracies": 0.25, "logps/chosen": -306.6339416503906, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -308.36224365234375, "logps/ref_rejected": -222.45750427246094, "logps/rejected": -234.72048950195312, "loss": 0.5277, "rewards/accuracies": 1.0, "rewards/chosen": 0.08641643077135086, "rewards/grad_term": 0.016946371644735336, "rewards/margins": 0.6995644569396973, "rewards/rejected": -0.6131480932235718, "step": 44 }, { "epoch": 0.09641135511515801, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.994736837402936, "learning_rate": 5.865046999014789e-07, "logits/chosen": 1.0356709957122803, "logits/rejected": 0.9350219368934631, "logps/accuracies": 0.25, "logps/chosen": -449.03717041015625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -447.7042236328125, "logps/ref_rejected": -399.0120544433594, "logps/rejected": -408.2356262207031, "loss": 0.537, "rewards/accuracies": 1.0, "rewards/chosen": -0.06664619594812393, "rewards/grad_term": 0.0201456006616354, "rewards/margins": 0.3945322036743164, "rewards/rejected": -0.46117842197418213, "step": 45 }, { "epoch": 0.09855382967327263, "flips/correct->correct": 1.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.684993785164789, "learning_rate": 5.898910605583271e-07, "logits/chosen": 0.8869616389274597, "logits/rejected": 0.9591479301452637, "logps/accuracies": 1.0, "logps/chosen": -190.82815551757812, "logps/ref_accuracies": 1.0, "logps/ref_chosen": -184.7421112060547, "logps/ref_rejected": -232.9263153076172, "logps/rejected": -237.15127563476562, "loss": 0.5444, "rewards/accuracies": 0.5, "rewards/chosen": -0.30430203676223755, "rewards/grad_term": 0.026146598160266876, "rewards/margins": -0.0930541530251503, "rewards/rejected": -0.21124787628650665, "step": 46 }, { "epoch": 0.10069630423138726, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 10.511687293551553, "learning_rate": 5.932045905791884e-07, "logits/chosen": 0.9791369438171387, "logits/rejected": 0.7344577312469482, "logps/accuracies": 0.0, "logps/chosen": -380.52587890625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -378.03131103515625, "logps/ref_rejected": -319.94256591796875, "logps/rejected": -337.8666687011719, "loss": 0.5447, "rewards/accuracies": 1.0, "rewards/chosen": -0.12472762912511826, "rewards/grad_term": 0.01607554219663143, "rewards/margins": 0.7714786529541016, "rewards/rejected": -0.896206259727478, "step": 47 }, { "epoch": 0.10283877878950187, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.055964810316027, "learning_rate": 5.964483568643951e-07, "logits/chosen": 0.8970568776130676, "logits/rejected": 0.48237085342407227, "logps/accuracies": 0.5, "logps/chosen": -349.3356018066406, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -343.4019775390625, "logps/ref_rejected": -257.5508117675781, "logps/rejected": -269.60809326171875, "loss": 0.5329, "rewards/accuracies": 0.75, "rewards/chosen": -0.2966794967651367, "rewards/grad_term": 0.021424874663352966, "rewards/margins": 0.30618318915367126, "rewards/rejected": -0.6028627157211304, "step": 48 }, { "epoch": 0.1049812533476165, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 10.617890748457961, "learning_rate": 5.996252365813443e-07, "logits/chosen": 0.9411455392837524, "logits/rejected": 0.5716761350631714, "logps/accuracies": 0.0, "logps/chosen": -416.63934326171875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -415.9534606933594, "logps/ref_rejected": -247.12747192382812, "logps/rejected": -250.519287109375, "loss": 0.5287, "rewards/accuracies": 0.5, "rewards/chosen": -0.03429294005036354, "rewards/grad_term": 0.023422496393322945, "rewards/margins": 0.1352972686290741, "rewards/rejected": -0.16959019005298615, "step": 49 }, { "epoch": 0.10712372790573112, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 10.56249795289356, "learning_rate": 6.02737932499173e-07, "logits/chosen": 0.7994714975357056, "logits/rejected": 0.42442572116851807, "logps/accuracies": 0.0, "logps/chosen": -441.199951171875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -438.83892822265625, "logps/ref_rejected": -322.400146484375, "logps/rejected": -338.1896667480469, "loss": 0.5245, "rewards/accuracies": 1.0, "rewards/chosen": -0.11804847419261932, "rewards/grad_term": 0.016969487071037292, "rewards/margins": 0.6714283227920532, "rewards/rejected": -0.789476752281189, "step": 50 }, { "epoch": 0.10926620246384575, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.886334758313449, "learning_rate": 6.057889868048325e-07, "logits/chosen": 1.0163636207580566, "logits/rejected": 0.8965986967086792, "logps/accuracies": 0.25, "logps/chosen": -416.2098083496094, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -416.4342041015625, "logps/ref_rejected": -324.2137145996094, "logps/rejected": -339.7252502441406, "loss": 0.5316, "rewards/accuracies": 0.75, "rewards/chosen": 0.011221121996641159, "rewards/grad_term": 0.01711445301771164, "rewards/margins": 0.7867982983589172, "rewards/rejected": -0.775577187538147, "step": 51 }, { "epoch": 0.11140867702196036, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 10.545934339053032, "learning_rate": 6.087807935775333e-07, "logits/chosen": 0.5185865759849548, "logits/rejected": 0.3614073395729065, "logps/accuracies": 0.25, "logps/chosen": -229.86053466796875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -228.54452514648438, "logps/ref_rejected": -195.83494567871094, "logps/rejected": -206.37149047851562, "loss": 0.5191, "rewards/accuracies": 1.0, "rewards/chosen": -0.06580007076263428, "rewards/grad_term": 0.019602250307798386, "rewards/margins": 0.461027592420578, "rewards/rejected": -0.5268276929855347, "step": 52 }, { "epoch": 0.11355115158007499, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.703023398000125, "learning_rate": 6.117156100749175e-07, "logits/chosen": 0.9196311235427856, "logits/rejected": 0.8846197128295898, "logps/accuracies": 0.25, "logps/chosen": -436.0137023925781, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -429.5511474609375, "logps/ref_rejected": -402.48614501953125, "logps/rejected": -423.6464538574219, "loss": 0.5158, "rewards/accuracies": 1.0, "rewards/chosen": -0.32312700152397156, "rewards/grad_term": 0.016992483288049698, "rewards/margins": 0.7348867654800415, "rewards/rejected": -1.0580137968063354, "step": 53 }, { "epoch": 0.1156936261381896, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.613596866402307, "learning_rate": 6.145955669642588e-07, "logits/chosen": 1.0864933729171753, "logits/rejected": 1.0046788454055786, "logps/accuracies": 0.75, "logps/chosen": -408.01947021484375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -394.4793395996094, "logps/ref_rejected": -375.3567199707031, "logps/rejected": -408.60504150390625, "loss": 0.5135, "rewards/accuracies": 1.0, "rewards/chosen": -0.6770049929618835, "rewards/grad_term": 0.013774032704532146, "rewards/margins": 0.9854103326797485, "rewards/rejected": -1.6624153852462769, "step": 54 }, { "epoch": 0.11783610069630424, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.495375001244799, "learning_rate": 6.174226776148516e-07, "logits/chosen": 0.8830907940864563, "logits/rejected": 0.7350561618804932, "logps/accuracies": 0.25, "logps/chosen": -330.6471252441406, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -317.4583740234375, "logps/ref_rejected": -287.886962890625, "logps/rejected": -309.34942626953125, "loss": 0.4821, "rewards/accuracies": 0.75, "rewards/chosen": -0.6594364643096924, "rewards/grad_term": 0.020037367939949036, "rewards/margins": 0.41368618607521057, "rewards/rejected": -1.0731226205825806, "step": 55 }, { "epoch": 0.11997857525441885, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.135520472135779, "learning_rate": 6.201988465531067e-07, "logits/chosen": 0.6756561994552612, "logits/rejected": 0.6564769744873047, "logps/accuracies": 0.5, "logps/chosen": -181.22642517089844, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -179.98342895507812, "logps/ref_rejected": -201.45867919921875, "logps/rejected": -211.14109802246094, "loss": 0.5161, "rewards/accuracies": 0.75, "rewards/chosen": -0.06215019151568413, "rewards/grad_term": 0.019981056451797485, "rewards/margins": 0.421970933675766, "rewards/rejected": -0.4841211438179016, "step": 56 }, { "epoch": 0.12212104981253348, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.52811690075287, "learning_rate": 6.229258771692866e-07, "logits/chosen": 0.7419092655181885, "logits/rejected": 0.8074868321418762, "logps/accuracies": 0.5, "logps/chosen": -186.41204833984375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -182.34783935546875, "logps/ref_rejected": -165.51622009277344, "logps/rejected": -176.84945678710938, "loss": 0.4952, "rewards/accuracies": 0.75, "rewards/chosen": -0.2032102644443512, "rewards/grad_term": 0.02091275155544281, "rewards/margins": 0.3634513318538666, "rewards/rejected": -0.5666615962982178, "step": 57 }, { "epoch": 0.1242635243706481, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.647643500777985, "learning_rate": 6.256054787539818e-07, "logits/chosen": 1.005966067314148, "logits/rejected": 0.8792574405670166, "logps/accuracies": 0.25, "logps/chosen": -421.94464111328125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -406.43414306640625, "logps/ref_rejected": -377.0458068847656, "logps/rejected": -405.022705078125, "loss": 0.4958, "rewards/accuracies": 0.75, "rewards/chosen": -0.7755249738693237, "rewards/grad_term": 0.017923269420862198, "rewards/margins": 0.623319149017334, "rewards/rejected": -1.3988441228866577, "step": 58 }, { "epoch": 0.12640599892876273, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.581762363344323, "learning_rate": 6.282392729330889e-07, "logits/chosen": 0.788644552230835, "logits/rejected": 0.8738614916801453, "logps/accuracies": 0.5, "logps/chosen": -299.53521728515625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -293.8477783203125, "logps/ref_rejected": -257.6317443847656, "logps/rejected": -273.8648681640625, "loss": 0.4967, "rewards/accuracies": 0.75, "rewards/chosen": -0.284370094537735, "rewards/grad_term": 0.019201159477233887, "rewards/margins": 0.5272856950759888, "rewards/rejected": -0.8116558194160461, "step": 59 }, { "epoch": 0.12854847348687734, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.152545681984156, "learning_rate": 6.308287995619528e-07, "logits/chosen": 1.099388837814331, "logits/rejected": 0.9928939342498779, "logps/accuracies": 0.75, "logps/chosen": -435.457763671875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -415.12860107421875, "logps/ref_rejected": -380.19720458984375, "logps/rejected": -426.2452392578125, "loss": 0.5009, "rewards/accuracies": 1.0, "rewards/chosen": -1.0164591073989868, "rewards/grad_term": 0.011328812688589096, "rewards/margins": 1.2859418392181396, "rewards/rejected": -2.302400827407837, "step": 60 }, { "epoch": 0.13069094804499196, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 10.54221239337406, "learning_rate": 6.33375522132322e-07, "logits/chosen": 0.9639657735824585, "logits/rejected": 0.8224814534187317, "logps/accuracies": 0.25, "logps/chosen": -348.4953918457031, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -334.9384765625, "logps/ref_rejected": -316.9508056640625, "logps/rejected": -356.79949951171875, "loss": 0.4842, "rewards/accuracies": 1.0, "rewards/chosen": -0.6778436899185181, "rewards/grad_term": 0.011656711809337139, "rewards/margins": 1.314591884613037, "rewards/rejected": -1.9924354553222656, "step": 61 }, { "epoch": 0.1328334226031066, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 9.022752821926046, "learning_rate": 6.358808327396516e-07, "logits/chosen": 1.0160002708435059, "logits/rejected": 0.616927981376648, "logps/accuracies": 0.0, "logps/chosen": -329.97100830078125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -312.87115478515625, "logps/ref_rejected": -194.55523681640625, "logps/rejected": -234.106201171875, "loss": 0.4768, "rewards/accuracies": 0.75, "rewards/chosen": -0.8549936413764954, "rewards/grad_term": 0.013443742878735065, "rewards/margins": 1.1225550174713135, "rewards/rejected": -1.9775487184524536, "step": 62 }, { "epoch": 0.13497589716122121, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.278955604267708, "learning_rate": 6.383460566529704e-07, "logits/chosen": 1.092294454574585, "logits/rejected": 0.9165109395980835, "logps/accuracies": 0.25, "logps/chosen": -447.57257080078125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -433.4320983886719, "logps/ref_rejected": -357.5963439941406, "logps/rejected": -389.59075927734375, "loss": 0.467, "rewards/accuracies": 1.0, "rewards/chosen": -0.7070247530937195, "rewards/grad_term": 0.014869745820760727, "rewards/margins": 0.8926937580108643, "rewards/rejected": -1.5997185707092285, "step": 63 }, { "epoch": 0.13711837171933583, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.629680006671775, "learning_rate": 6.407724565248689e-07, "logits/chosen": 1.0533897876739502, "logits/rejected": 0.7830870151519775, "logps/accuracies": 0.5, "logps/chosen": -331.41888427734375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -315.2283020019531, "logps/ref_rejected": -263.9187316894531, "logps/rejected": -279.4315185546875, "loss": 0.4427, "rewards/accuracies": 0.75, "rewards/chosen": -0.8095288276672363, "rewards/grad_term": 0.025076739490032196, "rewards/margins": -0.03389042615890503, "rewards/rejected": -0.7756383419036865, "step": 64 }, { "epoch": 0.13711837171933583, "eval_flips/correct->correct": 0.1599999964237213, "eval_flips/correct->incorrect": 0.0, "eval_flips/incorrect->correct": 0.019999999552965164, "eval_flips/incorrect->incorrect": 0.8199999928474426, "eval_logits/chosen": 0.9344247579574585, "eval_logits/rejected": 0.7796935439109802, "eval_logps/accuracies": 0.18000000715255737, "eval_logps/chosen": -337.09112548828125, "eval_logps/ref_accuracies": 0.1599999964237213, "eval_logps/ref_chosen": -323.51568603515625, "eval_logps/ref_rejected": -258.70098876953125, "eval_logps/rejected": -284.0068664550781, "eval_loss": 0.4775756597518921, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -0.6787735819816589, "eval_rewards/grad_term": 0.018939374014735222, "eval_rewards/margins": 0.5865211486816406, "eval_rewards/rejected": -1.2652947902679443, "eval_runtime": 374.3115, "eval_samples_per_second": 4.221, "eval_steps_per_second": 0.134, "step": 64 }, { "epoch": 0.13926084627745045, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 10.614066894777759, "learning_rate": 6.431612362750908e-07, "logits/chosen": 0.8821598887443542, "logits/rejected": 0.7168709635734558, "logps/accuracies": 0.25, "logps/chosen": -355.70806884765625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -342.8082275390625, "logps/ref_rejected": -322.1506652832031, "logps/rejected": -350.2712097167969, "loss": 0.4728, "rewards/accuracies": 1.0, "rewards/chosen": -0.6449924111366272, "rewards/grad_term": 0.016398221254348755, "rewards/margins": 0.7610346674919128, "rewards/rejected": -1.40602707862854, "step": 65 }, { "epoch": 0.1414033208355651, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.855172230097049, "learning_rate": 6.455135446776313e-07, "logits/chosen": 0.7938796281814575, "logits/rejected": 0.872632622718811, "logps/accuracies": 0.5, "logps/chosen": -310.7122802734375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -288.7535400390625, "logps/ref_rejected": -302.68560791015625, "logps/rejected": -338.1268615722656, "loss": 0.4281, "rewards/accuracies": 1.0, "rewards/chosen": -1.097936749458313, "rewards/grad_term": 0.017016390338540077, "rewards/margins": 0.6741248369216919, "rewards/rejected": -1.7720615863800049, "step": 66 }, { "epoch": 0.1435457953936797, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 9.759871054620625, "learning_rate": 6.478304786780968e-07, "logits/chosen": 0.6220331192016602, "logits/rejected": 0.5919966697692871, "logps/accuracies": 0.0, "logps/chosen": -368.77520751953125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -351.3057556152344, "logps/ref_rejected": -261.40069580078125, "logps/rejected": -304.46539306640625, "loss": 0.475, "rewards/accuracies": 0.75, "rewards/chosen": -0.8734725713729858, "rewards/grad_term": 0.014455066993832588, "rewards/margins": 1.2797632217407227, "rewards/rejected": -2.153235912322998, "step": 67 }, { "epoch": 0.14568826995179432, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.285309028552831, "learning_rate": 6.501130864653065e-07, "logits/chosen": 1.0729789733886719, "logits/rejected": 1.0459753274917603, "logps/accuracies": 0.5, "logps/chosen": -303.7723388671875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -290.2609558105469, "logps/ref_rejected": -262.08392333984375, "logps/rejected": -290.4940185546875, "loss": 0.4704, "rewards/accuracies": 0.75, "rewards/chosen": -0.6755678057670593, "rewards/grad_term": 0.016569742932915688, "rewards/margins": 0.7449362277984619, "rewards/rejected": -1.420504093170166, "step": 68 }, { "epoch": 0.14783074450990893, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.513566366172148, "learning_rate": 6.523623703186648e-07, "logits/chosen": 0.894729495048523, "logits/rejected": 1.0188794136047363, "logps/accuracies": 0.75, "logps/chosen": -317.3673095703125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -297.9476013183594, "logps/ref_rejected": -303.1460876464844, "logps/rejected": -348.55316162109375, "loss": 0.4135, "rewards/accuracies": 1.0, "rewards/chosen": -0.9709864258766174, "rewards/grad_term": 0.013456292450428009, "rewards/margins": 1.2993673086166382, "rewards/rejected": -2.2703537940979004, "step": 69 }, { "epoch": 0.14997321906802358, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.31200023242295, "learning_rate": 6.545792892506645e-07, "logits/chosen": 0.7371494174003601, "logits/rejected": 0.9037774801254272, "logps/accuracies": 0.5, "logps/chosen": -312.41497802734375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -298.4285888671875, "logps/ref_rejected": -268.10101318359375, "logps/rejected": -295.3829345703125, "loss": 0.3791, "rewards/accuracies": 0.5, "rewards/chosen": -0.6993191242218018, "rewards/grad_term": 0.017996307462453842, "rewards/margins": 0.6647781729698181, "rewards/rejected": -1.3640973567962646, "step": 70 }, { "epoch": 0.1521156936261382, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.814713554670275, "learning_rate": 6.567647614619587e-07, "logits/chosen": 0.8867220878601074, "logits/rejected": 0.8295634984970093, "logps/accuracies": 0.5, "logps/chosen": -343.9578552246094, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -321.64984130859375, "logps/ref_rejected": -283.7974548339844, "logps/rejected": -332.0108337402344, "loss": 0.4731, "rewards/accuracies": 1.0, "rewards/chosen": -1.1153992414474487, "rewards/grad_term": 0.012294553220272064, "rewards/margins": 1.2952699661254883, "rewards/rejected": -2.4106695652008057, "step": 71 }, { "epoch": 0.1542581681842528, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.137092152998001, "learning_rate": 6.589196666247328e-07, "logits/chosen": 0.9599927663803101, "logits/rejected": 0.6932302713394165, "logps/accuracies": 0.25, "logps/chosen": -317.6519775390625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -303.6397705078125, "logps/ref_rejected": -240.10572814941406, "logps/rejected": -283.4850158691406, "loss": 0.4246, "rewards/accuracies": 1.0, "rewards/chosen": -0.700611412525177, "rewards/grad_term": 0.012517021037638187, "rewards/margins": 1.4683525562286377, "rewards/rejected": -2.16896390914917, "step": 72 }, { "epoch": 0.15640064274236742, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 10.33751492174849, "learning_rate": 6.610448480085853e-07, "logits/chosen": 0.4570969045162201, "logits/rejected": 0.4160915017127991, "logps/accuracies": 0.25, "logps/chosen": -212.59051513671875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -212.9862060546875, "logps/ref_rejected": -181.0455322265625, "logps/rejected": -190.78341674804688, "loss": 0.4513, "rewards/accuracies": 0.75, "rewards/chosen": 0.01978490501642227, "rewards/grad_term": 0.019215064123272896, "rewards/margins": 0.5066791772842407, "rewards/rejected": -0.48689424991607666, "step": 73 }, { "epoch": 0.15854311730048207, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.366156800768572, "learning_rate": 6.631411144617796e-07, "logits/chosen": 0.7912124395370483, "logits/rejected": 0.7016277313232422, "logps/accuracies": 0.5, "logps/chosen": -337.2362365722656, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -318.22076416015625, "logps/ref_rejected": -285.63134765625, "logps/rejected": -356.1361999511719, "loss": 0.3717, "rewards/accuracies": 0.75, "rewards/chosen": -0.9507730603218079, "rewards/grad_term": 0.008385634049773216, "rewards/margins": 2.574469804763794, "rewards/rejected": -3.525242805480957, "step": 74 }, { "epoch": 0.16068559185859668, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.408007517521634, "learning_rate": 6.652092422595104e-07, "logits/chosen": 0.8023636937141418, "logits/rejected": 0.7735366821289062, "logps/accuracies": 0.75, "logps/chosen": -306.0715026855469, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -275.71649169921875, "logps/ref_rejected": -242.64198303222656, "logps/rejected": -290.594970703125, "loss": 0.3573, "rewards/accuracies": 0.5, "rewards/chosen": -1.5177501440048218, "rewards/grad_term": 0.018539991229772568, "rewards/margins": 0.8798991441726685, "rewards/rejected": -2.3976492881774902, "step": 75 }, { "epoch": 0.1628280664167113, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.365050151926509, "learning_rate": 6.672499768297604e-07, "logits/chosen": 0.8398016691207886, "logits/rejected": 0.8039026260375977, "logps/accuracies": 0.25, "logps/chosen": -338.2458801269531, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -300.6439208984375, "logps/ref_rejected": -266.9490661621094, "logps/rejected": -321.94207763671875, "loss": 0.4334, "rewards/accuracies": 1.0, "rewards/chosen": -1.8800976276397705, "rewards/grad_term": 0.016930393874645233, "rewards/margins": 0.8695545196533203, "rewards/rejected": -2.749652147293091, "step": 76 }, { "epoch": 0.1649705409748259, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.75229443985931, "learning_rate": 6.692640343663431e-07, "logits/chosen": 0.8696576356887817, "logits/rejected": 0.876376211643219, "logps/accuracies": 0.5, "logps/chosen": -341.6177978515625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -313.14349365234375, "logps/ref_rejected": -307.5328369140625, "logps/rejected": -359.84185791015625, "loss": 0.3678, "rewards/accuracies": 0.75, "rewards/chosen": -1.4237148761749268, "rewards/grad_term": 0.014790714718401432, "rewards/margins": 1.1917363405227661, "rewards/rejected": -2.6154510974884033, "step": 77 }, { "epoch": 0.16711301553294056, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 8.819014293843647, "learning_rate": 6.712521033378708e-07, "logits/chosen": 1.0659058094024658, "logits/rejected": 0.6623323559761047, "logps/accuracies": 0.0, "logps/chosen": -331.59368896484375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -292.36590576171875, "logps/ref_rejected": -146.4788360595703, "logps/rejected": -199.08897399902344, "loss": 0.4175, "rewards/accuracies": 0.5, "rewards/chosen": -1.9613897800445557, "rewards/grad_term": 0.020100167021155357, "rewards/margins": 0.6691172122955322, "rewards/rejected": -2.630506992340088, "step": 78 }, { "epoch": 0.16925549009105517, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 10.100776831917964, "learning_rate": 6.732148459006032e-07, "logits/chosen": 0.7964029312133789, "logits/rejected": 0.6478776931762695, "logps/accuracies": 0.25, "logps/chosen": -320.44476318359375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -274.2988586425781, "logps/ref_rejected": -246.58741760253906, "logps/rejected": -303.3105773925781, "loss": 0.394, "rewards/accuracies": 0.5, "rewards/chosen": -2.3072948455810547, "rewards/grad_term": 0.022354397922754288, "rewards/margins": 0.5288637280464172, "rewards/rejected": -2.8361587524414062, "step": 79 }, { "epoch": 0.1713979646491698, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.215506702371561, "learning_rate": 6.751528992224267e-07, "logits/chosen": 1.0231748819351196, "logits/rejected": 0.9256105422973633, "logps/accuracies": 0.5, "logps/chosen": -371.5207824707031, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -323.4173583984375, "logps/ref_rejected": -287.026611328125, "logps/rejected": -385.26141357421875, "loss": 0.358, "rewards/accuracies": 1.0, "rewards/chosen": -2.4051713943481445, "rewards/grad_term": 0.008784506469964981, "rewards/margins": 2.5065674781799316, "rewards/rejected": -4.911738872528076, "step": 80 }, { "epoch": 0.1735404392072844, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.286461530064006, "learning_rate": 6.770668767245965e-07, "logits/chosen": 0.9782469868659973, "logits/rejected": 0.6938868761062622, "logps/accuracies": 0.5, "logps/chosen": -289.8148498535156, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -267.3828125, "logps/ref_rejected": -228.83135986328125, "logps/rejected": -291.4294128417969, "loss": 0.3489, "rewards/accuracies": 0.75, "rewards/chosen": -1.1216013431549072, "rewards/grad_term": 0.012826542370021343, "rewards/margins": 2.0083022117614746, "rewards/rejected": -3.129903554916382, "step": 81 }, { "epoch": 0.17568291376539905, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.751825989493291, "learning_rate": 6.789573692472892e-07, "logits/chosen": 0.8200634717941284, "logits/rejected": 0.9688655138015747, "logps/accuracies": 0.5, "logps/chosen": -390.98150634765625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -348.5020751953125, "logps/ref_rejected": -350.07159423828125, "logps/rejected": -411.6036682128906, "loss": 0.393, "rewards/accuracies": 1.0, "rewards/chosen": -2.123972177505493, "rewards/grad_term": 0.014323998242616653, "rewards/margins": 0.9526323080062866, "rewards/rejected": -3.0766046047210693, "step": 82 }, { "epoch": 0.17782538832351366, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.216588835288912, "learning_rate": 6.808249461445122e-07, "logits/chosen": 1.072934627532959, "logits/rejected": 0.911789059638977, "logps/accuracies": 0.75, "logps/chosen": -479.0382080078125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -419.0920104980469, "logps/ref_rejected": -390.1745300292969, "logps/rejected": -485.1689453125, "loss": 0.3724, "rewards/accuracies": 1.0, "rewards/chosen": -2.997309684753418, "rewards/grad_term": 0.0095378328114748, "rewards/margins": 1.7524113655090332, "rewards/rejected": -4.749721050262451, "step": 83 }, { "epoch": 0.17996786288162828, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.25160372674251, "learning_rate": 6.826701563134442e-07, "logits/chosen": 0.8092617988586426, "logits/rejected": 0.7646486163139343, "logps/accuracies": 0.25, "logps/chosen": -410.1537780761719, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -384.58575439453125, "logps/ref_rejected": -304.9964904785156, "logps/rejected": -354.8568115234375, "loss": 0.3661, "rewards/accuracies": 1.0, "rewards/chosen": -1.2784030437469482, "rewards/grad_term": 0.012526333332061768, "rewards/margins": 1.2146127223968506, "rewards/rejected": -2.493015766143799, "step": 84 }, { "epoch": 0.1821103374397429, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.118700174990803, "learning_rate": 6.844935291628642e-07, "logits/chosen": 1.0619235038757324, "logits/rejected": 0.9984962344169617, "logps/accuracies": 0.25, "logps/chosen": -409.0619201660156, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -377.1236267089844, "logps/ref_rejected": -326.3685302734375, "logps/rejected": -405.38037109375, "loss": 0.3594, "rewards/accuracies": 1.0, "rewards/chosen": -1.596914291381836, "rewards/grad_term": 0.009607160463929176, "rewards/margins": 2.353677988052368, "rewards/rejected": -3.950592041015625, "step": 85 }, { "epoch": 0.18425281199785754, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.571514403852135, "learning_rate": 6.862955755249413e-07, "logits/chosen": 0.9191405773162842, "logits/rejected": 0.801410436630249, "logps/accuracies": 0.5, "logps/chosen": -312.10894775390625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -278.7945556640625, "logps/ref_rejected": -234.3353271484375, "logps/rejected": -277.7581787109375, "loss": 0.3311, "rewards/accuracies": 1.0, "rewards/chosen": -1.6657218933105469, "rewards/grad_term": 0.019233139231801033, "rewards/margins": 0.5054203271865845, "rewards/rejected": -2.171142339706421, "step": 86 }, { "epoch": 0.18639528655597215, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 8.757747519851693, "learning_rate": 6.880767885143194e-07, "logits/chosen": 0.8738288283348083, "logits/rejected": 0.684519350528717, "logps/accuracies": 0.25, "logps/chosen": -410.3459167480469, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -376.2248840332031, "logps/ref_rejected": -312.1935729980469, "logps/rejected": -370.3998107910156, "loss": 0.3656, "rewards/accuracies": 1.0, "rewards/chosen": -1.7060527801513672, "rewards/grad_term": 0.01283444557338953, "rewards/margins": 1.2042596340179443, "rewards/rejected": -2.9103124141693115, "step": 87 }, { "epoch": 0.18853776111408677, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 8.985961972583743, "learning_rate": 6.898376443381053e-07, "logits/chosen": 0.9711716771125793, "logits/rejected": 0.8526138067245483, "logps/accuracies": 0.25, "logps/chosen": -403.53948974609375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -360.1213073730469, "logps/ref_rejected": -311.68292236328125, "logps/rejected": -381.7286376953125, "loss": 0.3717, "rewards/accuracies": 0.5, "rewards/chosen": -2.170908212661743, "rewards/grad_term": 0.018295947462320328, "rewards/margins": 1.3313778638839722, "rewards/rejected": -3.502286195755005, "step": 88 }, { "epoch": 0.19068023567220138, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.683570806703395, "learning_rate": 6.915786030600927e-07, "logits/chosen": 0.843113899230957, "logits/rejected": 0.7101360559463501, "logps/accuracies": 0.5, "logps/chosen": -428.8794250488281, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -389.9796447753906, "logps/ref_rejected": -322.93255615234375, "logps/rejected": -415.9242248535156, "loss": 0.3549, "rewards/accuracies": 1.0, "rewards/chosen": -1.9449876546859741, "rewards/grad_term": 0.004990034736692905, "rewards/margins": 2.704596996307373, "rewards/rejected": -4.649584770202637, "step": 89 }, { "epoch": 0.19282271023031602, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.498185901157015, "learning_rate": 6.933001093222904e-07, "logits/chosen": 0.9054147601127625, "logits/rejected": 0.7073743939399719, "logps/accuracies": 0.75, "logps/chosen": -294.0045471191406, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -280.5016174316406, "logps/ref_rejected": -278.77313232421875, "logps/rejected": -316.6849060058594, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": -0.6751471161842346, "rewards/grad_term": 0.011688041500747204, "rewards/margins": 1.2204415798187256, "rewards/rejected": -1.895588755607605, "step": 90 }, { "epoch": 0.19496518478843064, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.490840097910208, "learning_rate": 6.950025930265823e-07, "logits/chosen": 0.8930804133415222, "logits/rejected": 0.9076898097991943, "logps/accuracies": 0.75, "logps/chosen": -353.33563232421875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -318.02215576171875, "logps/ref_rejected": -333.869140625, "logps/rejected": -411.1112365722656, "loss": 0.3822, "rewards/accuracies": 1.0, "rewards/chosen": -1.7656757831573486, "rewards/grad_term": 0.006808947771787643, "rewards/margins": 2.0964293479919434, "rewards/rejected": -3.862105369567871, "step": 91 }, { "epoch": 0.19710765934654526, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.412294871691426, "learning_rate": 6.966864699791386e-07, "logits/chosen": 0.7138292789459229, "logits/rejected": 0.6344163417816162, "logps/accuracies": 0.75, "logps/chosen": -284.339111328125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -267.2784423828125, "logps/ref_rejected": -271.889892578125, "logps/rejected": -323.4517822265625, "loss": 0.3446, "rewards/accuracies": 1.0, "rewards/chosen": -0.8530327081680298, "rewards/grad_term": 0.008454471826553345, "rewards/margins": 1.7250609397888184, "rewards/rejected": -2.5780937671661377, "step": 92 }, { "epoch": 0.19925013390465987, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 7.851393127918492, "learning_rate": 6.983521424999892e-07, "logits/chosen": 1.1121702194213867, "logits/rejected": 1.0464541912078857, "logps/accuracies": 0.0, "logps/chosen": -323.29986572265625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -300.96435546875, "logps/ref_rejected": -263.5765686035156, "logps/rejected": -319.5726318359375, "loss": 0.3308, "rewards/accuracies": 1.0, "rewards/chosen": -1.1167749166488647, "rewards/grad_term": 0.01233928557485342, "rewards/margins": 1.683027982711792, "rewards/rejected": -2.7998030185699463, "step": 93 }, { "epoch": 0.20139260846277451, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 7.7736296892204395, "learning_rate": 7e-07, "logits/chosen": 0.8960837125778198, "logits/rejected": 0.761021614074707, "logps/accuracies": 0.25, "logps/chosen": -377.28021240234375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -330.466064453125, "logps/ref_rejected": -286.0745849609375, "logps/rejected": -351.7135009765625, "loss": 0.296, "rewards/accuracies": 1.0, "rewards/chosen": -2.3407070636749268, "rewards/grad_term": 0.015900662168860435, "rewards/margins": 0.9412397742271423, "rewards/rejected": -3.281947135925293, "step": 94 }, { "epoch": 0.20353508302088913, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.371159163521241, "learning_rate": 7e-07, "logits/chosen": 0.9651041030883789, "logits/rejected": 0.7860502004623413, "logps/accuracies": 0.25, "logps/chosen": -425.8303527832031, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -374.81097412109375, "logps/ref_rejected": -321.1005859375, "logps/rejected": -400.951171875, "loss": 0.3427, "rewards/accuracies": 1.0, "rewards/chosen": -2.5509684085845947, "rewards/grad_term": 0.0106906583532691, "rewards/margins": 1.4415616989135742, "rewards/rejected": -3.99252986907959, "step": 95 }, { "epoch": 0.20567755757900374, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.177619063233454, "learning_rate": 6.991646778042959e-07, "logits/chosen": 0.7661604881286621, "logits/rejected": 0.6532018780708313, "logps/accuracies": 0.5, "logps/chosen": -257.8993835449219, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -240.183349609375, "logps/ref_rejected": -198.9567413330078, "logps/rejected": -253.74404907226562, "loss": 0.3882, "rewards/accuracies": 1.0, "rewards/chosen": -0.8858024477958679, "rewards/grad_term": 0.011164636351168156, "rewards/margins": 1.8535633087158203, "rewards/rejected": -2.739365816116333, "step": 96 }, { "epoch": 0.20782003213711836, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.250952702236187, "learning_rate": 6.983293556085918e-07, "logits/chosen": 0.8017215728759766, "logits/rejected": 0.6558822393417358, "logps/accuracies": 0.5, "logps/chosen": -316.91131591796875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -272.19842529296875, "logps/ref_rejected": -228.5543975830078, "logps/rejected": -309.4710693359375, "loss": 0.336, "rewards/accuracies": 0.75, "rewards/chosen": -2.235644817352295, "rewards/grad_term": 0.010754971764981747, "rewards/margins": 1.810187816619873, "rewards/rejected": -4.045832633972168, "step": 97 }, { "epoch": 0.209962506695233, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.58529126245538, "learning_rate": 6.974940334128877e-07, "logits/chosen": 0.8682336807250977, "logits/rejected": 0.7378227710723877, "logps/accuracies": 0.5, "logps/chosen": -253.228515625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -229.12966918945312, "logps/ref_rejected": -178.55392456054688, "logps/rejected": -221.88421630859375, "loss": 0.3899, "rewards/accuracies": 1.0, "rewards/chosen": -1.2049428224563599, "rewards/grad_term": 0.014865662902593613, "rewards/margins": 0.9615722894668579, "rewards/rejected": -2.1665151119232178, "step": 98 }, { "epoch": 0.21210498125334762, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.572183229995163, "learning_rate": 6.966587112171838e-07, "logits/chosen": 0.8709318041801453, "logits/rejected": 0.7755447626113892, "logps/accuracies": 0.5, "logps/chosen": -408.872802734375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -363.9078369140625, "logps/ref_rejected": -299.89794921875, "logps/rejected": -368.15155029296875, "loss": 0.3652, "rewards/accuracies": 0.5, "rewards/chosen": -2.2482473850250244, "rewards/grad_term": 0.018564339727163315, "rewards/margins": 1.1644330024719238, "rewards/rejected": -3.4126803874969482, "step": 99 }, { "epoch": 0.21424745581146223, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.624545109237621, "learning_rate": 6.958233890214797e-07, "logits/chosen": 0.9372926354408264, "logits/rejected": 0.8751659989356995, "logps/accuracies": 0.5, "logps/chosen": -474.0740966796875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -424.33819580078125, "logps/ref_rejected": -390.84747314453125, "logps/rejected": -506.8907165527344, "loss": 0.3056, "rewards/accuracies": 1.0, "rewards/chosen": -2.4867947101593018, "rewards/grad_term": 0.004845261108130217, "rewards/margins": 3.3153672218322754, "rewards/rejected": -5.802162170410156, "step": 100 }, { "epoch": 0.21638993036957685, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.067295914483129, "learning_rate": 6.949880668257756e-07, "logits/chosen": 0.9544820785522461, "logits/rejected": 0.9464821815490723, "logps/accuracies": 0.5, "logps/chosen": -494.697509765625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -454.8968505859375, "logps/ref_rejected": -400.18536376953125, "logps/rejected": -466.2589111328125, "loss": 0.358, "rewards/accuracies": 1.0, "rewards/chosen": -1.990033745765686, "rewards/grad_term": 0.013565966859459877, "rewards/margins": 1.3136428594589233, "rewards/rejected": -3.3036766052246094, "step": 101 }, { "epoch": 0.2185324049276915, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.960375031224479, "learning_rate": 6.941527446300716e-07, "logits/chosen": 0.8813360333442688, "logits/rejected": 0.695158839225769, "logps/accuracies": 0.5, "logps/chosen": -291.07232666015625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -251.7209930419922, "logps/ref_rejected": -190.36097717285156, "logps/rejected": -257.9454345703125, "loss": 0.3402, "rewards/accuracies": 1.0, "rewards/chosen": -1.9675672054290771, "rewards/grad_term": 0.011471440084278584, "rewards/margins": 1.411657452583313, "rewards/rejected": -3.3792247772216797, "step": 102 }, { "epoch": 0.2206748794858061, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.26805795080682, "learning_rate": 6.933174224343675e-07, "logits/chosen": 0.8939443826675415, "logits/rejected": 0.625817596912384, "logps/accuracies": 0.75, "logps/chosen": -145.8295440673828, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -133.74105834960938, "logps/ref_rejected": -113.23294067382812, "logps/rejected": -148.43218994140625, "loss": 0.3759, "rewards/accuracies": 1.0, "rewards/chosen": -0.6044239401817322, "rewards/grad_term": 0.014196785166859627, "rewards/margins": 1.1555386781692505, "rewards/rejected": -1.7599626779556274, "step": 103 }, { "epoch": 0.22281735404392072, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.93695871637079, "learning_rate": 6.924821002386635e-07, "logits/chosen": 1.0364937782287598, "logits/rejected": 0.563789427280426, "logps/accuracies": 0.5, "logps/chosen": -199.2762451171875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -178.82568359375, "logps/ref_rejected": -115.96835327148438, "logps/rejected": -148.9056396484375, "loss": 0.3053, "rewards/accuracies": 0.75, "rewards/chosen": -1.0225275754928589, "rewards/grad_term": 0.019687440246343613, "rewards/margins": 0.6243367195129395, "rewards/rejected": -1.6468642950057983, "step": 104 }, { "epoch": 0.22495982860203534, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 8.863344087781872, "learning_rate": 6.916467780429593e-07, "logits/chosen": 0.869300901889801, "logits/rejected": 0.7735204100608826, "logps/accuracies": 0.25, "logps/chosen": -459.5062561035156, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -410.751708984375, "logps/ref_rejected": -351.19830322265625, "logps/rejected": -436.1498718261719, "loss": 0.3494, "rewards/accuracies": 1.0, "rewards/chosen": -2.4377286434173584, "rewards/grad_term": 0.008553780615329742, "rewards/margins": 1.8098492622375488, "rewards/rejected": -4.247577667236328, "step": 105 }, { "epoch": 0.22710230316014998, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.525931151845846, "learning_rate": 6.908114558472554e-07, "logits/chosen": 0.9212764501571655, "logits/rejected": 0.7540117502212524, "logps/accuracies": 0.75, "logps/chosen": -369.387939453125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -333.4745788574219, "logps/ref_rejected": -299.4930419921875, "logps/rejected": -394.82952880859375, "loss": 0.3309, "rewards/accuracies": 1.0, "rewards/chosen": -1.7956693172454834, "rewards/grad_term": 0.00758711900562048, "rewards/margins": 2.9711527824401855, "rewards/rejected": -4.76682186126709, "step": 106 }, { "epoch": 0.2292447777182646, "flips/correct->correct": 1.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.335424081368634, "learning_rate": 6.899761336515513e-07, "logits/chosen": 0.7336133718490601, "logits/rejected": 0.8765916228294373, "logps/accuracies": 1.0, "logps/chosen": -244.1851348876953, "logps/ref_accuracies": 1.0, "logps/ref_chosen": -227.0002899169922, "logps/ref_rejected": -286.24835205078125, "logps/rejected": -336.29425048828125, "loss": 0.3244, "rewards/accuracies": 1.0, "rewards/chosen": -0.8592423796653748, "rewards/grad_term": 0.009657299146056175, "rewards/margins": 1.643053412437439, "rewards/rejected": -2.502295970916748, "step": 107 }, { "epoch": 0.2313872522763792, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.973679697099532, "learning_rate": 6.891408114558472e-07, "logits/chosen": 0.9743479490280151, "logits/rejected": 1.0019692182540894, "logps/accuracies": 0.75, "logps/chosen": -339.78680419921875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -297.22393798828125, "logps/ref_rejected": -304.42578125, "logps/rejected": -374.98779296875, "loss": 0.2818, "rewards/accuracies": 1.0, "rewards/chosen": -2.1281425952911377, "rewards/grad_term": 0.01099520642310381, "rewards/margins": 1.3999576568603516, "rewards/rejected": -3.5281002521514893, "step": 108 }, { "epoch": 0.23352972683449383, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.989727540849666, "learning_rate": 6.883054892601431e-07, "logits/chosen": 1.1120461225509644, "logits/rejected": 0.7604467272758484, "logps/accuracies": 0.5, "logps/chosen": -414.3526916503906, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -393.2984619140625, "logps/ref_rejected": -310.2208251953125, "logps/rejected": -371.7498779296875, "loss": 0.3362, "rewards/accuracies": 1.0, "rewards/chosen": -1.052709937095642, "rewards/grad_term": 0.010077744722366333, "rewards/margins": 2.023743152618408, "rewards/rejected": -3.07645320892334, "step": 109 }, { "epoch": 0.23567220139260847, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.51051941007351, "learning_rate": 6.874701670644392e-07, "logits/chosen": 0.8338203430175781, "logits/rejected": 0.677147388458252, "logps/accuracies": 0.5, "logps/chosen": -344.638671875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -336.9551086425781, "logps/ref_rejected": -251.71192932128906, "logps/rejected": -278.19818115234375, "loss": 0.3953, "rewards/accuracies": 1.0, "rewards/chosen": -0.3841773271560669, "rewards/grad_term": 0.01569775864481926, "rewards/margins": 0.9401355981826782, "rewards/rejected": -1.3243129253387451, "step": 110 }, { "epoch": 0.2378146759507231, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.352097570099665, "learning_rate": 6.866348448687351e-07, "logits/chosen": 1.0819525718688965, "logits/rejected": 0.9638932943344116, "logps/accuracies": 0.5, "logps/chosen": -340.05609130859375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -319.25897216796875, "logps/ref_rejected": -289.39013671875, "logps/rejected": -384.5931091308594, "loss": 0.2425, "rewards/accuracies": 1.0, "rewards/chosen": -1.0398552417755127, "rewards/grad_term": 0.007856637239456177, "rewards/margins": 3.7202935218811035, "rewards/rejected": -4.760149002075195, "step": 111 }, { "epoch": 0.2399571505088377, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 8.759463222456692, "learning_rate": 6.85799522673031e-07, "logits/chosen": 0.6596381664276123, "logits/rejected": 0.4419824481010437, "logps/accuracies": 0.25, "logps/chosen": -287.6408996582031, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -243.9873046875, "logps/ref_rejected": -173.07931518554688, "logps/rejected": -247.95965576171875, "loss": 0.3367, "rewards/accuracies": 1.0, "rewards/chosen": -2.1826789379119873, "rewards/grad_term": 0.011957314796745777, "rewards/margins": 1.5613383054733276, "rewards/rejected": -3.7440171241760254, "step": 112 }, { "epoch": 0.24209962506695232, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 8.026439048323674, "learning_rate": 6.849642004773269e-07, "logits/chosen": 0.9305301904678345, "logits/rejected": 0.8421428203582764, "logps/accuracies": 0.25, "logps/chosen": -441.80340576171875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -359.161865234375, "logps/ref_rejected": -317.730712890625, "logps/rejected": -430.5499267578125, "loss": 0.2678, "rewards/accuracies": 0.5, "rewards/chosen": -4.132076263427734, "rewards/grad_term": 0.01556328870356083, "rewards/margins": 1.508885145187378, "rewards/rejected": -5.640961647033691, "step": 113 }, { "epoch": 0.24424209962506696, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 8.200400573622296, "learning_rate": 6.841288782816229e-07, "logits/chosen": 0.957403838634491, "logits/rejected": 0.7602821588516235, "logps/accuracies": 0.25, "logps/chosen": -317.9858703613281, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -303.8868713378906, "logps/ref_rejected": -208.02256774902344, "logps/rejected": -256.6693115234375, "loss": 0.283, "rewards/accuracies": 1.0, "rewards/chosen": -0.7049498558044434, "rewards/grad_term": 0.009732572361826897, "rewards/margins": 1.7273874282836914, "rewards/rejected": -2.4323372840881348, "step": 114 }, { "epoch": 0.24638457418318158, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.032492821291632, "learning_rate": 6.832935560859188e-07, "logits/chosen": 0.8847552537918091, "logits/rejected": 0.8709891438484192, "logps/accuracies": 0.75, "logps/chosen": -275.5125427246094, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -239.18618774414062, "logps/ref_rejected": -209.97958374023438, "logps/rejected": -299.82904052734375, "loss": 0.335, "rewards/accuracies": 1.0, "rewards/chosen": -1.816316843032837, "rewards/grad_term": 0.007570344489067793, "rewards/margins": 2.6761562824249268, "rewards/rejected": -4.492473125457764, "step": 115 }, { "epoch": 0.2485270487412962, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.054789967348844, "learning_rate": 6.824582338902147e-07, "logits/chosen": 0.9024197459220886, "logits/rejected": 0.7947896718978882, "logps/accuracies": 1.0, "logps/chosen": -347.18023681640625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -307.41802978515625, "logps/ref_rejected": -293.6680908203125, "logps/rejected": -407.90380859375, "loss": 0.3347, "rewards/accuracies": 1.0, "rewards/chosen": -1.9881106615066528, "rewards/grad_term": 0.0060688708908855915, "rewards/margins": 3.7236742973327637, "rewards/rejected": -5.711784839630127, "step": 116 }, { "epoch": 0.25066952329941083, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 8.668010213567692, "learning_rate": 6.816229116945108e-07, "logits/chosen": 0.9976560473442078, "logits/rejected": 0.6948249340057373, "logps/accuracies": 0.0, "logps/chosen": -401.39923095703125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -337.87274169921875, "logps/ref_rejected": -240.39515686035156, "logps/rejected": -330.98101806640625, "loss": 0.3004, "rewards/accuracies": 0.75, "rewards/chosen": -3.17632794380188, "rewards/grad_term": 0.013613393530249596, "rewards/margins": 1.3529647588729858, "rewards/rejected": -4.529292583465576, "step": 117 }, { "epoch": 0.25281199785752545, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.847556657578917, "learning_rate": 6.807875894988067e-07, "logits/chosen": 0.9075788259506226, "logits/rejected": 0.7977707982063293, "logps/accuracies": 0.75, "logps/chosen": -375.44140625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -321.0868225097656, "logps/ref_rejected": -267.4314270019531, "logps/rejected": -378.83599853515625, "loss": 0.3172, "rewards/accuracies": 1.0, "rewards/chosen": -2.7177300453186035, "rewards/grad_term": 0.004429055377840996, "rewards/margins": 2.8524982929229736, "rewards/rejected": -5.570228099822998, "step": 118 }, { "epoch": 0.25495447241564007, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 8.68803296697522, "learning_rate": 6.799522673031026e-07, "logits/chosen": 0.895818293094635, "logits/rejected": 0.575642466545105, "logps/accuracies": 0.0, "logps/chosen": -318.50885009765625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -287.44415283203125, "logps/ref_rejected": -177.8115234375, "logps/rejected": -243.67047119140625, "loss": 0.3158, "rewards/accuracies": 1.0, "rewards/chosen": -1.5532350540161133, "rewards/grad_term": 0.010060964152216911, "rewards/margins": 1.739712119102478, "rewards/rejected": -3.2929470539093018, "step": 119 }, { "epoch": 0.2570969469737547, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 11.719803160063796, "learning_rate": 6.791169451073985e-07, "logits/chosen": 0.7772294282913208, "logits/rejected": 0.8315998315811157, "logps/accuracies": 1.0, "logps/chosen": -432.6302185058594, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -355.44903564453125, "logps/ref_rejected": -386.0659484863281, "logps/rejected": -523.24609375, "loss": 0.2837, "rewards/accuracies": 1.0, "rewards/chosen": -3.859060287475586, "rewards/grad_term": 0.003882521763443947, "rewards/margins": 2.9999477863311768, "rewards/rejected": -6.859008312225342, "step": 120 }, { "epoch": 0.2592394215318693, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 7.238978607387634, "learning_rate": 6.782816229116945e-07, "logits/chosen": 0.8819637894630432, "logits/rejected": 0.7054441571235657, "logps/accuracies": 0.25, "logps/chosen": -393.9786071777344, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -364.44207763671875, "logps/ref_rejected": -345.252685546875, "logps/rejected": -389.1627502441406, "loss": 0.2768, "rewards/accuracies": 1.0, "rewards/chosen": -1.4768271446228027, "rewards/grad_term": 0.01706988736987114, "rewards/margins": 0.7186762094497681, "rewards/rejected": -2.1955032348632812, "step": 121 }, { "epoch": 0.2613818960899839, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.838179827940348, "learning_rate": 6.774463007159905e-07, "logits/chosen": 0.7459653615951538, "logits/rejected": 0.7466898560523987, "logps/accuracies": 0.5, "logps/chosen": -289.6307067871094, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -244.8365020751953, "logps/ref_rejected": -223.6743927001953, "logps/rejected": -318.4463195800781, "loss": 0.273, "rewards/accuracies": 0.75, "rewards/chosen": -2.2397100925445557, "rewards/grad_term": 0.010229886509478092, "rewards/margins": 2.4988858699798584, "rewards/rejected": -4.738595962524414, "step": 122 }, { "epoch": 0.2635243706480985, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.71899475513759, "learning_rate": 6.766109785202863e-07, "logits/chosen": 0.9420760869979858, "logits/rejected": 0.8325188755989075, "logps/accuracies": 0.75, "logps/chosen": -358.3894348144531, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -302.154052734375, "logps/ref_rejected": -271.524658203125, "logps/rejected": -382.03302001953125, "loss": 0.2531, "rewards/accuracies": 0.75, "rewards/chosen": -2.811767339706421, "rewards/grad_term": 0.010155830532312393, "rewards/margins": 2.7136499881744385, "rewards/rejected": -5.525417327880859, "step": 123 }, { "epoch": 0.2656668452062132, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.827729622567765, "learning_rate": 6.757756563245823e-07, "logits/chosen": 0.9883730411529541, "logits/rejected": 0.8319672346115112, "logps/accuracies": 0.5, "logps/chosen": -461.412109375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -367.73486328125, "logps/ref_rejected": -298.51544189453125, "logps/rejected": -510.0968017578125, "loss": 0.2583, "rewards/accuracies": 1.0, "rewards/chosen": -4.683863162994385, "rewards/grad_term": 0.0023802227806299925, "rewards/margins": 5.895203590393066, "rewards/rejected": -10.57906723022461, "step": 124 }, { "epoch": 0.2678093197643278, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 5.875715435058411, "learning_rate": 6.749403341288783e-07, "logits/chosen": 0.9549310207366943, "logits/rejected": 0.8220203518867493, "logps/accuracies": 0.25, "logps/chosen": -292.9206237792969, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -250.8220672607422, "logps/ref_rejected": -188.01498413085938, "logps/rejected": -286.50128173828125, "loss": 0.2039, "rewards/accuracies": 1.0, "rewards/chosen": -2.1049275398254395, "rewards/grad_term": 0.006195048335939646, "rewards/margins": 2.8193886280059814, "rewards/rejected": -4.92431640625, "step": 125 }, { "epoch": 0.26995179432244243, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.463586322032903, "learning_rate": 6.741050119331742e-07, "logits/chosen": 0.9875746965408325, "logits/rejected": 0.7808788418769836, "logps/accuracies": 0.5, "logps/chosen": -251.4386444091797, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -224.3441619873047, "logps/ref_rejected": -220.13214111328125, "logps/rejected": -279.40972900390625, "loss": 0.3086, "rewards/accuracies": 1.0, "rewards/chosen": -1.3547240495681763, "rewards/grad_term": 0.009899970144033432, "rewards/margins": 1.6091564893722534, "rewards/rejected": -2.9638805389404297, "step": 126 }, { "epoch": 0.27209426888055704, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.995166618243903, "learning_rate": 6.732696897374701e-07, "logits/chosen": 1.0715935230255127, "logits/rejected": 0.9156344532966614, "logps/accuracies": 0.5, "logps/chosen": -495.48419189453125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -411.56317138671875, "logps/ref_rejected": -354.6976623535156, "logps/rejected": -507.9322509765625, "loss": 0.268, "rewards/accuracies": 1.0, "rewards/chosen": -4.196050643920898, "rewards/grad_term": 0.0026723581831902266, "rewards/margins": 3.465679168701172, "rewards/rejected": -7.66172981262207, "step": 127 }, { "epoch": 0.27423674343867166, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.959838246440706, "learning_rate": 6.72434367541766e-07, "logits/chosen": 0.9897805452346802, "logits/rejected": 0.8163310289382935, "logps/accuracies": 1.0, "logps/chosen": -487.4618225097656, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -402.695556640625, "logps/ref_rejected": -365.8092346191406, "logps/rejected": -549.47314453125, "loss": 0.303, "rewards/accuracies": 1.0, "rewards/chosen": -4.238313674926758, "rewards/grad_term": 0.0018401599954813719, "rewards/margins": 4.944880485534668, "rewards/rejected": -9.183194160461426, "step": 128 }, { "epoch": 0.27423674343867166, "eval_flips/correct->correct": 0.1599999964237213, "eval_flips/correct->incorrect": 0.0, "eval_flips/incorrect->correct": 0.20000000298023224, "eval_flips/incorrect->incorrect": 0.6399999856948853, "eval_logits/chosen": 0.871035099029541, "eval_logits/rejected": 0.7403469681739807, "eval_logps/accuracies": 0.36000001430511475, "eval_logps/chosen": -374.1800537109375, "eval_logps/ref_accuracies": 0.1599999964237213, "eval_logps/ref_chosen": -323.51568603515625, "eval_logps/ref_rejected": -258.70098876953125, "eval_logps/rejected": -354.16534423828125, "eval_loss": 0.290331095457077, "eval_rewards/accuracies": 0.7799999713897705, "eval_rewards/chosen": -2.533219575881958, "eval_rewards/grad_term": 0.011603965424001217, "eval_rewards/margins": 2.2400009632110596, "eval_rewards/rejected": -4.773220062255859, "eval_runtime": 374.0534, "eval_samples_per_second": 4.224, "eval_steps_per_second": 0.134, "step": 128 }, { "epoch": 0.2763792179967863, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.320596690520534, "learning_rate": 6.715990453460621e-07, "logits/chosen": 0.959011435508728, "logits/rejected": 0.8785006999969482, "logps/accuracies": 1.0, "logps/chosen": -253.6005859375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -211.31517028808594, "logps/ref_rejected": -201.87469482421875, "logps/rejected": -295.11090087890625, "loss": 0.2446, "rewards/accuracies": 1.0, "rewards/chosen": -2.114271402359009, "rewards/grad_term": 0.01070532575249672, "rewards/margins": 2.5475387573242188, "rewards/rejected": -4.661810874938965, "step": 129 }, { "epoch": 0.2785216925549009, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 7.49663463971219, "learning_rate": 6.707637231503579e-07, "logits/chosen": 0.9510787129402161, "logits/rejected": 0.6340673565864563, "logps/accuracies": 0.25, "logps/chosen": -353.7513427734375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -313.7836608886719, "logps/ref_rejected": -263.53387451171875, "logps/rejected": -344.3742980957031, "loss": 0.2306, "rewards/accuracies": 1.0, "rewards/chosen": -1.9983829259872437, "rewards/grad_term": 0.007957426831126213, "rewards/margins": 2.043639659881592, "rewards/rejected": -4.042022228240967, "step": 130 }, { "epoch": 0.2806641671130155, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 10.934812547934818, "learning_rate": 6.699284009546539e-07, "logits/chosen": 0.7116758823394775, "logits/rejected": 0.6254409551620483, "logps/accuracies": 0.0, "logps/chosen": -361.2596740722656, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -345.98199462890625, "logps/ref_rejected": -231.908447265625, "logps/rejected": -297.0892028808594, "loss": 0.2395, "rewards/accuracies": 1.0, "rewards/chosen": -0.763884425163269, "rewards/grad_term": 0.008144252933561802, "rewards/margins": 2.4951541423797607, "rewards/rejected": -3.2590384483337402, "step": 131 }, { "epoch": 0.2828066416711302, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.274621875830415, "learning_rate": 6.690930787589498e-07, "logits/chosen": 0.9305391907691956, "logits/rejected": 0.9035634994506836, "logps/accuracies": 1.0, "logps/chosen": -454.7461242675781, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -338.783447265625, "logps/ref_rejected": -331.75201416015625, "logps/rejected": -508.1329650878906, "loss": 0.2374, "rewards/accuracies": 1.0, "rewards/chosen": -5.79813289642334, "rewards/grad_term": 0.0040429579094052315, "rewards/margins": 3.020915985107422, "rewards/rejected": -8.819048881530762, "step": 132 }, { "epoch": 0.2849491162292448, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.310190672888075, "learning_rate": 6.682577565632458e-07, "logits/chosen": 0.9652221202850342, "logits/rejected": 0.8905255198478699, "logps/accuracies": 0.75, "logps/chosen": -474.09820556640625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -404.9132080078125, "logps/ref_rejected": -397.7674560546875, "logps/rejected": -542.0930786132812, "loss": 0.2522, "rewards/accuracies": 1.0, "rewards/chosen": -3.459249496459961, "rewards/grad_term": 0.007244814652949572, "rewards/margins": 3.7570319175720215, "rewards/rejected": -7.216281414031982, "step": 133 }, { "epoch": 0.2870915907873594, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.377477290301309, "learning_rate": 6.674224343675417e-07, "logits/chosen": 0.9276644587516785, "logits/rejected": 0.7834001779556274, "logps/accuracies": 0.75, "logps/chosen": -482.92828369140625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -378.0372009277344, "logps/ref_rejected": -330.1478271484375, "logps/rejected": -559.7905883789062, "loss": 0.2727, "rewards/accuracies": 1.0, "rewards/chosen": -5.244553089141846, "rewards/grad_term": 0.001372232916764915, "rewards/margins": 6.237585544586182, "rewards/rejected": -11.482138633728027, "step": 134 }, { "epoch": 0.289234065345474, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.304875956111065, "learning_rate": 6.665871121718377e-07, "logits/chosen": 0.8576828837394714, "logits/rejected": 0.6341058611869812, "logps/accuracies": 0.5, "logps/chosen": -337.80535888671875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -294.550537109375, "logps/ref_rejected": -220.64215087890625, "logps/rejected": -329.065673828125, "loss": 0.2647, "rewards/accuracies": 1.0, "rewards/chosen": -2.162740707397461, "rewards/grad_term": 0.009863924235105515, "rewards/margins": 3.2584362030029297, "rewards/rejected": -5.421176910400391, "step": 135 }, { "epoch": 0.29137653990358864, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.625140040989452, "learning_rate": 6.657517899761337e-07, "logits/chosen": 0.8870478272438049, "logits/rejected": 0.7576145529747009, "logps/accuracies": 0.5, "logps/chosen": -305.3637390136719, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -262.2385559082031, "logps/ref_rejected": -227.50672912597656, "logps/rejected": -304.53564453125, "loss": 0.2461, "rewards/accuracies": 1.0, "rewards/chosen": -2.1562600135803223, "rewards/grad_term": 0.010284369811415672, "rewards/margins": 1.6951854228973389, "rewards/rejected": -3.8514456748962402, "step": 136 }, { "epoch": 0.29351901446170325, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 7.018484141515877, "learning_rate": 6.649164677804296e-07, "logits/chosen": 0.9112040400505066, "logits/rejected": 0.7212855815887451, "logps/accuracies": 0.25, "logps/chosen": -343.9112243652344, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -280.6199645996094, "logps/ref_rejected": -210.87319946289062, "logps/rejected": -326.63885498046875, "loss": 0.2715, "rewards/accuracies": 1.0, "rewards/chosen": -3.164562225341797, "rewards/grad_term": 0.009311579167842865, "rewards/margins": 2.62372088432312, "rewards/rejected": -5.788283348083496, "step": 137 }, { "epoch": 0.29566148901981787, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.034703724953653, "learning_rate": 6.640811455847255e-07, "logits/chosen": 0.7124534845352173, "logits/rejected": 0.6710624694824219, "logps/accuracies": 0.5, "logps/chosen": -434.5594482421875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -374.4966125488281, "logps/ref_rejected": -310.2477722167969, "logps/rejected": -434.73895263671875, "loss": 0.2497, "rewards/accuracies": 1.0, "rewards/chosen": -3.0031418800354004, "rewards/grad_term": 0.00873873382806778, "rewards/margins": 3.221416473388672, "rewards/rejected": -6.2245588302612305, "step": 138 }, { "epoch": 0.2978039635779325, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.812429176308449, "learning_rate": 6.632458233890214e-07, "logits/chosen": 0.9117505550384521, "logits/rejected": 0.8502323031425476, "logps/accuracies": 0.75, "logps/chosen": -334.134765625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -272.14581298828125, "logps/ref_rejected": -239.22955322265625, "logps/rejected": -368.5514221191406, "loss": 0.2585, "rewards/accuracies": 1.0, "rewards/chosen": -3.0994479656219482, "rewards/grad_term": 0.0053825220093131065, "rewards/margins": 3.366644859313965, "rewards/rejected": -6.466092586517334, "step": 139 }, { "epoch": 0.29994643813604716, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.660588605349608, "learning_rate": 6.624105011933175e-07, "logits/chosen": 0.6605690717697144, "logits/rejected": 0.8247154951095581, "logps/accuracies": 0.5, "logps/chosen": -329.8360900878906, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -290.978271484375, "logps/ref_rejected": -246.50843811035156, "logps/rejected": -299.4759521484375, "loss": 0.2595, "rewards/accuracies": 0.5, "rewards/chosen": -1.94289231300354, "rewards/grad_term": 0.01796804741024971, "rewards/margins": 0.705483078956604, "rewards/rejected": -2.6483755111694336, "step": 140 }, { "epoch": 0.30208891269416177, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 10.598864233990223, "learning_rate": 6.615751789976133e-07, "logits/chosen": 0.6485729217529297, "logits/rejected": 0.6901566982269287, "logps/accuracies": 1.0, "logps/chosen": -373.37396240234375, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -334.75689697265625, "logps/ref_rejected": -381.53717041015625, "logps/rejected": -493.22930908203125, "loss": 0.2639, "rewards/accuracies": 1.0, "rewards/chosen": -1.930854082107544, "rewards/grad_term": 0.0024079105351120234, "rewards/margins": 3.6537532806396484, "rewards/rejected": -5.584607124328613, "step": 141 }, { "epoch": 0.3042313872522764, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.28171427184855, "learning_rate": 6.607398568019093e-07, "logits/chosen": 0.9653871655464172, "logits/rejected": 0.6570479273796082, "logps/accuracies": 0.5, "logps/chosen": -340.83685302734375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -281.6165771484375, "logps/ref_rejected": -203.56463623046875, "logps/rejected": -293.328857421875, "loss": 0.2356, "rewards/accuracies": 1.0, "rewards/chosen": -2.9610140323638916, "rewards/grad_term": 0.010910983197391033, "rewards/margins": 1.5271971225738525, "rewards/rejected": -4.488211154937744, "step": 142 }, { "epoch": 0.306373861810391, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.851086611102091, "learning_rate": 6.599045346062052e-07, "logits/chosen": 0.9678068161010742, "logits/rejected": 0.8588843941688538, "logps/accuracies": 0.75, "logps/chosen": -388.9454345703125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -324.3150634765625, "logps/ref_rejected": -277.83184814453125, "logps/rejected": -417.49212646484375, "loss": 0.1983, "rewards/accuracies": 1.0, "rewards/chosen": -3.2315189838409424, "rewards/grad_term": 0.004653441719710827, "rewards/margins": 3.7514941692352295, "rewards/rejected": -6.983013153076172, "step": 143 }, { "epoch": 0.3085163363685056, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.817695728935396, "learning_rate": 6.590692124105012e-07, "logits/chosen": 0.9185338616371155, "logits/rejected": 0.8464267253875732, "logps/accuracies": 0.75, "logps/chosen": -355.24176025390625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -319.5701599121094, "logps/ref_rejected": -291.1425476074219, "logps/rejected": -393.3614196777344, "loss": 0.2415, "rewards/accuracies": 1.0, "rewards/chosen": -1.7835807800292969, "rewards/grad_term": 0.002678102580830455, "rewards/margins": 3.327362060546875, "rewards/rejected": -5.110942840576172, "step": 144 }, { "epoch": 0.31065881092662023, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 11.370643366730324, "learning_rate": 6.582338902147971e-07, "logits/chosen": 0.5740557909011841, "logits/rejected": 0.39975208044052124, "logps/accuracies": 0.25, "logps/chosen": -487.64642333984375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -415.9956359863281, "logps/ref_rejected": -316.01708984375, "logps/rejected": -444.40020751953125, "loss": 0.3096, "rewards/accuracies": 0.75, "rewards/chosen": -3.5825393199920654, "rewards/grad_term": 0.013735326007008553, "rewards/margins": 2.836615562438965, "rewards/rejected": -6.419155120849609, "step": 145 }, { "epoch": 0.31280128548473485, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.25, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.223483927318776, "learning_rate": 6.57398568019093e-07, "logits/chosen": 0.7999095320701599, "logits/rejected": 0.5960132479667664, "logps/accuracies": 0.75, "logps/chosen": -299.7486572265625, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -218.3001251220703, "logps/ref_rejected": -201.54733276367188, "logps/rejected": -300.24505615234375, "loss": 0.2041, "rewards/accuracies": 0.75, "rewards/chosen": -4.0724263191223145, "rewards/grad_term": 0.02213701605796814, "rewards/margins": 0.8624599575996399, "rewards/rejected": -4.9348859786987305, "step": 146 }, { "epoch": 0.3149437600428495, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.579748942980807, "learning_rate": 6.56563245823389e-07, "logits/chosen": 0.926275372505188, "logits/rejected": 0.8347383737564087, "logps/accuracies": 0.75, "logps/chosen": -363.90753173828125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -273.9169616699219, "logps/ref_rejected": -306.166015625, "logps/rejected": -483.9077453613281, "loss": 0.257, "rewards/accuracies": 0.75, "rewards/chosen": -4.4995293617248535, "rewards/grad_term": 0.007000477984547615, "rewards/margins": 4.387556076049805, "rewards/rejected": -8.887085914611816, "step": 147 }, { "epoch": 0.31708623460096413, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.401901301955716, "learning_rate": 6.557279236276849e-07, "logits/chosen": 0.8783835172653198, "logits/rejected": 0.748361349105835, "logps/accuracies": 0.75, "logps/chosen": -439.5329284667969, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -380.14080810546875, "logps/ref_rejected": -311.24652099609375, "logps/rejected": -461.9928283691406, "loss": 0.2612, "rewards/accuracies": 0.75, "rewards/chosen": -2.969606399536133, "rewards/grad_term": 0.007177918218076229, "rewards/margins": 4.567710876464844, "rewards/rejected": -7.53731632232666, "step": 148 }, { "epoch": 0.31922870915907875, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 10.72349764158799, "learning_rate": 6.548926014319809e-07, "logits/chosen": 0.889015793800354, "logits/rejected": 0.6200151443481445, "logps/accuracies": 0.25, "logps/chosen": -367.63616943359375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -328.658203125, "logps/ref_rejected": -265.4963073730469, "logps/rejected": -371.23028564453125, "loss": 0.2875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9488979578018188, "rewards/grad_term": 0.003809453221037984, "rewards/margins": 3.337800979614258, "rewards/rejected": -5.286699295043945, "step": 149 }, { "epoch": 0.32137118371719336, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.234046149517951, "learning_rate": 6.540572792362768e-07, "logits/chosen": 0.7394288182258606, "logits/rejected": 0.7122300863265991, "logps/accuracies": 0.75, "logps/chosen": -405.5599365234375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -309.9801940917969, "logps/ref_rejected": -281.0711975097656, "logps/rejected": -448.6300048828125, "loss": 0.2894, "rewards/accuracies": 1.0, "rewards/chosen": -4.778985977172852, "rewards/grad_term": 0.004231796134263277, "rewards/margins": 3.5989530086517334, "rewards/rejected": -8.377939224243164, "step": 150 }, { "epoch": 0.323513658275308, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 12.025800847575807, "learning_rate": 6.532219570405727e-07, "logits/chosen": 0.5386347770690918, "logits/rejected": 0.35092538595199585, "logps/accuracies": 0.5, "logps/chosen": -216.28070068359375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -184.84176635742188, "logps/ref_rejected": -179.14764404296875, "logps/rejected": -240.31222534179688, "loss": 0.2617, "rewards/accuracies": 1.0, "rewards/chosen": -1.5719472169876099, "rewards/grad_term": 0.012875164858996868, "rewards/margins": 1.4862821102142334, "rewards/rejected": -3.058229446411133, "step": 151 }, { "epoch": 0.3256561328334226, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.899213029036904, "learning_rate": 6.523866348448687e-07, "logits/chosen": 0.9789618253707886, "logits/rejected": 0.363411545753479, "logps/accuracies": 0.5, "logps/chosen": -279.868408203125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -243.69374084472656, "logps/ref_rejected": -157.2808380126953, "logps/rejected": -250.97132873535156, "loss": 0.241, "rewards/accuracies": 0.75, "rewards/chosen": -1.8087329864501953, "rewards/grad_term": 0.009951984509825706, "rewards/margins": 2.875791072845459, "rewards/rejected": -4.6845245361328125, "step": 152 }, { "epoch": 0.3277986073915372, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.25, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.604763643820585, "learning_rate": 6.515513126491647e-07, "logits/chosen": 0.8571368455886841, "logits/rejected": 0.25795796513557434, "logps/accuracies": 0.25, "logps/chosen": -449.5162658691406, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -410.160400390625, "logps/ref_rejected": -145.2788543701172, "logps/rejected": -226.03286743164062, "loss": 0.2287, "rewards/accuracies": 0.75, "rewards/chosen": -1.9677932262420654, "rewards/grad_term": 0.015731465071439743, "rewards/margins": 2.0699081420898438, "rewards/rejected": -4.03770112991333, "step": 153 }, { "epoch": 0.3299410819496518, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 8.719226366200353, "learning_rate": 6.507159904534606e-07, "logits/chosen": 0.6871969103813171, "logits/rejected": 0.4975748658180237, "logps/accuracies": 0.25, "logps/chosen": -325.2784423828125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -273.6944580078125, "logps/ref_rejected": -270.9206237792969, "logps/rejected": -343.53662109375, "loss": 0.2694, "rewards/accuracies": 0.75, "rewards/chosen": -2.5791993141174316, "rewards/grad_term": 0.015888353809714317, "rewards/margins": 1.0516000986099243, "rewards/rejected": -3.6307995319366455, "step": 154 }, { "epoch": 0.3320835565077665, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.321361613018943, "learning_rate": 6.498806682577566e-07, "logits/chosen": 0.703808069229126, "logits/rejected": 0.6969115734100342, "logps/accuracies": 1.0, "logps/chosen": -476.7120361328125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -397.993896484375, "logps/ref_rejected": -352.0650329589844, "logps/rejected": -520.11376953125, "loss": 0.1933, "rewards/accuracies": 1.0, "rewards/chosen": -3.935906171798706, "rewards/grad_term": 0.005700279027223587, "rewards/margins": 4.466530799865723, "rewards/rejected": -8.402437210083008, "step": 155 }, { "epoch": 0.3342260310658811, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 8.480714775934256, "learning_rate": 6.490453460620525e-07, "logits/chosen": 1.0573246479034424, "logits/rejected": 0.7870907187461853, "logps/accuracies": 0.25, "logps/chosen": -498.836669921875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -427.01885986328125, "logps/ref_rejected": -330.6030578613281, "logps/rejected": -500.70513916015625, "loss": 0.2106, "rewards/accuracies": 1.0, "rewards/chosen": -3.590888023376465, "rewards/grad_term": 0.0047126878052949905, "rewards/margins": 4.914216995239258, "rewards/rejected": -8.505105018615723, "step": 156 }, { "epoch": 0.33636850562399573, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.421964128197317, "learning_rate": 6.482100238663484e-07, "logits/chosen": 0.8880590796470642, "logits/rejected": 0.7068474888801575, "logps/accuracies": 0.75, "logps/chosen": -279.9468994140625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -263.3426513671875, "logps/ref_rejected": -203.75613403320312, "logps/rejected": -287.427978515625, "loss": 0.2597, "rewards/accuracies": 1.0, "rewards/chosen": -0.8302121162414551, "rewards/grad_term": 0.008612211793661118, "rewards/margins": 3.353381395339966, "rewards/rejected": -4.18359375, "step": 157 }, { "epoch": 0.33851098018211034, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.8752213374271, "learning_rate": 6.473747016706444e-07, "logits/chosen": 0.7713953852653503, "logits/rejected": 0.7244459986686707, "logps/accuracies": 1.0, "logps/chosen": -343.670654296875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -289.24114990234375, "logps/ref_rejected": -304.27593994140625, "logps/rejected": -411.5351257324219, "loss": 0.2123, "rewards/accuracies": 1.0, "rewards/chosen": -2.7214746475219727, "rewards/grad_term": 0.007066743914037943, "rewards/margins": 2.641486406326294, "rewards/rejected": -5.362961292266846, "step": 158 }, { "epoch": 0.34065345474022496, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.961209126432363, "learning_rate": 6.465393794749403e-07, "logits/chosen": 0.4089430570602417, "logits/rejected": 0.48331207036972046, "logps/accuracies": 1.0, "logps/chosen": -227.35739135742188, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -194.3919219970703, "logps/ref_rejected": -197.94259643554688, "logps/rejected": -291.18634033203125, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": -1.648273229598999, "rewards/grad_term": 0.007467413786798716, "rewards/margins": 3.0139148235321045, "rewards/rejected": -4.6621880531311035, "step": 159 }, { "epoch": 0.3427959292983396, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.623215056313119, "learning_rate": 6.457040572792363e-07, "logits/chosen": 0.7995873093605042, "logits/rejected": 0.8776368498802185, "logps/accuracies": 0.75, "logps/chosen": -625.0630493164062, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -586.06396484375, "logps/ref_rejected": -436.4676818847656, "logps/rejected": -507.0144348144531, "loss": 0.1998, "rewards/accuracies": 1.0, "rewards/chosen": -1.9499545097351074, "rewards/grad_term": 0.008956504985690117, "rewards/margins": 1.5773828029632568, "rewards/rejected": -3.527337074279785, "step": 160 }, { "epoch": 0.3449384038564542, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.887366061229715, "learning_rate": 6.448687350835322e-07, "logits/chosen": 0.7675126791000366, "logits/rejected": 0.530870258808136, "logps/accuracies": 0.5, "logps/chosen": -353.36944580078125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -317.55523681640625, "logps/ref_rejected": -319.5533447265625, "logps/rejected": -376.0810546875, "loss": 0.2781, "rewards/accuracies": 0.75, "rewards/chosen": -1.790710687637329, "rewards/grad_term": 0.01590893603861332, "rewards/margins": 1.035674810409546, "rewards/rejected": -2.826385498046875, "step": 161 }, { "epoch": 0.3470808784145688, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.945357899951022, "learning_rate": 6.440334128878281e-07, "logits/chosen": 0.586737871170044, "logits/rejected": 0.4655018150806427, "logps/accuracies": 0.5, "logps/chosen": -474.0818786621094, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -397.549560546875, "logps/ref_rejected": -303.9313049316406, "logps/rejected": -453.242431640625, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": -3.826618194580078, "rewards/grad_term": 0.005110522732138634, "rewards/margins": 3.6389386653900146, "rewards/rejected": -7.465556621551514, "step": 162 }, { "epoch": 0.3492233529726835, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.421904565149697, "learning_rate": 6.431980906921241e-07, "logits/chosen": 0.8405193090438843, "logits/rejected": 0.9586330652236938, "logps/accuracies": 0.75, "logps/chosen": -362.607666015625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -323.2601318359375, "logps/ref_rejected": -350.115966796875, "logps/rejected": -414.9684143066406, "loss": 0.1903, "rewards/accuracies": 0.75, "rewards/chosen": -1.9673755168914795, "rewards/grad_term": 0.014036407694220543, "rewards/margins": 1.2752478122711182, "rewards/rejected": -3.2426233291625977, "step": 163 }, { "epoch": 0.3513658275307981, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.31758932737487, "learning_rate": 6.4236276849642e-07, "logits/chosen": 0.9069796800613403, "logits/rejected": 0.8120055198669434, "logps/accuracies": 0.5, "logps/chosen": -529.5686645507812, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -436.9043273925781, "logps/ref_rejected": -385.4817199707031, "logps/rejected": -571.4048461914062, "loss": 0.2857, "rewards/accuracies": 1.0, "rewards/chosen": -4.633216381072998, "rewards/grad_term": 0.001609130296856165, "rewards/margins": 4.662940502166748, "rewards/rejected": -9.296156883239746, "step": 164 }, { "epoch": 0.3535083020889127, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 6.7907749776023, "learning_rate": 6.41527446300716e-07, "logits/chosen": 0.7551314830780029, "logits/rejected": 0.6372643709182739, "logps/accuracies": 0.25, "logps/chosen": -594.386962890625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -496.920166015625, "logps/ref_rejected": -391.0464172363281, "logps/rejected": -584.2574462890625, "loss": 0.2221, "rewards/accuracies": 1.0, "rewards/chosen": -4.873340129852295, "rewards/grad_term": 0.0031820686999708414, "rewards/margins": 4.787212371826172, "rewards/rejected": -9.660552978515625, "step": 165 }, { "epoch": 0.3556507766470273, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.42834345389666, "learning_rate": 6.406921241050118e-07, "logits/chosen": 0.8926582336425781, "logits/rejected": 0.6021265983581543, "logps/accuracies": 0.5, "logps/chosen": -535.192138671875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -454.7031555175781, "logps/ref_rejected": -355.3533935546875, "logps/rejected": -487.2066650390625, "loss": 0.2189, "rewards/accuracies": 1.0, "rewards/chosen": -4.024447441101074, "rewards/grad_term": 0.007553639821708202, "rewards/margins": 2.56821608543396, "rewards/rejected": -6.592663764953613, "step": 166 }, { "epoch": 0.35779325120514194, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 12.44062714406834, "learning_rate": 6.398568019093079e-07, "logits/chosen": 0.9791277647018433, "logits/rejected": 0.7142946720123291, "logps/accuracies": 0.5, "logps/chosen": -324.64971923828125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -303.8039245605469, "logps/ref_rejected": -202.87620544433594, "logps/rejected": -278.5081481933594, "loss": 0.2451, "rewards/accuracies": 1.0, "rewards/chosen": -1.0422909259796143, "rewards/grad_term": 0.009761723689734936, "rewards/margins": 2.7393064498901367, "rewards/rejected": -3.781597137451172, "step": 167 }, { "epoch": 0.35993572576325655, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.985736298029532, "learning_rate": 6.390214797136038e-07, "logits/chosen": 0.6620911955833435, "logits/rejected": 0.6493997573852539, "logps/accuracies": 0.75, "logps/chosen": -189.7664794921875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -157.066650390625, "logps/ref_rejected": -141.00611877441406, "logps/rejected": -213.98069763183594, "loss": 0.2257, "rewards/accuracies": 0.75, "rewards/chosen": -1.6349915266036987, "rewards/grad_term": 0.010831539519131184, "rewards/margins": 2.013737201690674, "rewards/rejected": -3.648728370666504, "step": 168 }, { "epoch": 0.36207820032137117, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.312051247371532, "learning_rate": 6.381861575178997e-07, "logits/chosen": 0.9859127998352051, "logits/rejected": 0.7602252960205078, "logps/accuracies": 1.0, "logps/chosen": -367.61090087890625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -327.2779541015625, "logps/ref_rejected": -278.348876953125, "logps/rejected": -416.47991943359375, "loss": 0.2197, "rewards/accuracies": 1.0, "rewards/chosen": -2.016645908355713, "rewards/grad_term": 0.005300410091876984, "rewards/margins": 4.88990592956543, "rewards/rejected": -6.906552314758301, "step": 169 }, { "epoch": 0.3642206748794858, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.642763111324385, "learning_rate": 6.373508353221956e-07, "logits/chosen": 0.8475183248519897, "logits/rejected": 0.7893280386924744, "logps/accuracies": 1.0, "logps/chosen": -466.7734375, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -354.476806640625, "logps/ref_rejected": -379.0105285644531, "logps/rejected": -591.1034545898438, "loss": 0.2312, "rewards/accuracies": 1.0, "rewards/chosen": -5.614831924438477, "rewards/grad_term": 0.0010664674919098616, "rewards/margins": 4.989813804626465, "rewards/rejected": -10.604645729064941, "step": 170 }, { "epoch": 0.36636314943760045, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.565545454910559, "learning_rate": 6.365155131264916e-07, "logits/chosen": 0.7403796911239624, "logits/rejected": 0.6863211393356323, "logps/accuracies": 0.5, "logps/chosen": -413.04949951171875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -341.1173400878906, "logps/ref_rejected": -277.3340148925781, "logps/rejected": -390.7886047363281, "loss": 0.2088, "rewards/accuracies": 1.0, "rewards/chosen": -3.596607208251953, "rewards/grad_term": 0.006689072586596012, "rewards/margins": 2.076122283935547, "rewards/rejected": -5.672729015350342, "step": 171 }, { "epoch": 0.36850562399571507, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.792718661583799, "learning_rate": 6.356801909307876e-07, "logits/chosen": 1.0259259939193726, "logits/rejected": 0.874252438545227, "logps/accuracies": 0.75, "logps/chosen": -411.4170837402344, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -338.65203857421875, "logps/ref_rejected": -322.8931579589844, "logps/rejected": -444.8133544921875, "loss": 0.2075, "rewards/accuracies": 1.0, "rewards/chosen": -3.638251543045044, "rewards/grad_term": 0.00537948589771986, "rewards/margins": 2.4577579498291016, "rewards/rejected": -6.096009254455566, "step": 172 }, { "epoch": 0.3706480985538297, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.152790670378876, "learning_rate": 6.348448687350834e-07, "logits/chosen": 0.9457736015319824, "logits/rejected": 0.7330727577209473, "logps/accuracies": 0.5, "logps/chosen": -449.04766845703125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -385.5958251953125, "logps/ref_rejected": -290.0688171386719, "logps/rejected": -419.5983581542969, "loss": 0.2484, "rewards/accuracies": 1.0, "rewards/chosen": -3.1725926399230957, "rewards/grad_term": 0.004974587354809046, "rewards/margins": 3.3038861751556396, "rewards/rejected": -6.476478576660156, "step": 173 }, { "epoch": 0.3727905731119443, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.514176538085659, "learning_rate": 6.340095465393795e-07, "logits/chosen": 0.6537495851516724, "logits/rejected": 0.8065310716629028, "logps/accuracies": 0.75, "logps/chosen": -366.6497802734375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -316.4726867675781, "logps/ref_rejected": -318.7170715332031, "logps/rejected": -415.7598876953125, "loss": 0.2316, "rewards/accuracies": 0.75, "rewards/chosen": -2.508854389190674, "rewards/grad_term": 0.017329072579741478, "rewards/margins": 2.343287467956543, "rewards/rejected": -4.852141857147217, "step": 174 }, { "epoch": 0.3749330476700589, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.348177913147394, "learning_rate": 6.331742243436754e-07, "logits/chosen": 0.8572670221328735, "logits/rejected": 0.791947603225708, "logps/accuracies": 0.75, "logps/chosen": -487.83331298828125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -378.57891845703125, "logps/ref_rejected": -352.18182373046875, "logps/rejected": -550.4212646484375, "loss": 0.1988, "rewards/accuracies": 1.0, "rewards/chosen": -5.4627180099487305, "rewards/grad_term": 0.0037125912494957447, "rewards/margins": 4.449254989624023, "rewards/rejected": -9.911972045898438, "step": 175 }, { "epoch": 0.37707552222817353, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.515808366477856, "learning_rate": 6.323389021479714e-07, "logits/chosen": 0.8945661783218384, "logits/rejected": 0.6957411170005798, "logps/accuracies": 0.75, "logps/chosen": -261.3265380859375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -211.19613647460938, "logps/ref_rejected": -165.06219482421875, "logps/rejected": -267.1302795410156, "loss": 0.2924, "rewards/accuracies": 0.75, "rewards/chosen": -2.5065195560455322, "rewards/grad_term": 0.011427883058786392, "rewards/margins": 2.5968849658966064, "rewards/rejected": -5.103404521942139, "step": 176 }, { "epoch": 0.37921799678628815, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 12.786001249289741, "learning_rate": 6.315035799522672e-07, "logits/chosen": 0.8739601969718933, "logits/rejected": 0.669786274433136, "logps/accuracies": 0.5, "logps/chosen": -329.546142578125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -280.71832275390625, "logps/ref_rejected": -243.82247924804688, "logps/rejected": -331.6929931640625, "loss": 0.3338, "rewards/accuracies": 1.0, "rewards/chosen": -2.441390037536621, "rewards/grad_term": 0.007867410778999329, "rewards/margins": 1.952134609222412, "rewards/rejected": -4.393524646759033, "step": 177 }, { "epoch": 0.38136047134440276, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.208154660206382, "learning_rate": 6.306682577565633e-07, "logits/chosen": 0.9574888944625854, "logits/rejected": 0.8498983979225159, "logps/accuracies": 1.0, "logps/chosen": -543.029052734375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -424.92236328125, "logps/ref_rejected": -352.2156066894531, "logps/rejected": -606.2493286132812, "loss": 0.182, "rewards/accuracies": 1.0, "rewards/chosen": -5.90533447265625, "rewards/grad_term": 8.483060082653537e-05, "rewards/margins": 6.796352386474609, "rewards/rejected": -12.70168685913086, "step": 178 }, { "epoch": 0.38350294590251743, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.640876069823683, "learning_rate": 6.298329355608592e-07, "logits/chosen": 0.9586235284805298, "logits/rejected": 0.8088182210922241, "logps/accuracies": 0.75, "logps/chosen": -398.8780212402344, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -298.6359558105469, "logps/ref_rejected": -262.9993896484375, "logps/rejected": -415.80419921875, "loss": 0.196, "rewards/accuracies": 1.0, "rewards/chosen": -5.012104034423828, "rewards/grad_term": 0.006371453404426575, "rewards/margins": 2.6281375885009766, "rewards/rejected": -7.640241622924805, "step": 179 }, { "epoch": 0.38564542046063205, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.553393120180624, "learning_rate": 6.289976133651551e-07, "logits/chosen": 1.0224734544754028, "logits/rejected": 0.756576657295227, "logps/accuracies": 0.75, "logps/chosen": -353.6854553222656, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -302.3289794921875, "logps/ref_rejected": -237.62245178222656, "logps/rejected": -358.02716064453125, "loss": 0.2158, "rewards/accuracies": 1.0, "rewards/chosen": -2.5678231716156006, "rewards/grad_term": 0.005727603565901518, "rewards/margins": 3.452413558959961, "rewards/rejected": -6.020236492156982, "step": 180 }, { "epoch": 0.38778789501874666, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.447493546427074, "learning_rate": 6.28162291169451e-07, "logits/chosen": 1.0232813358306885, "logits/rejected": 0.7466151714324951, "logps/accuracies": 0.5, "logps/chosen": -400.76641845703125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -331.197998046875, "logps/ref_rejected": -238.2156524658203, "logps/rejected": -412.0185546875, "loss": 0.2115, "rewards/accuracies": 1.0, "rewards/chosen": -3.478421449661255, "rewards/grad_term": 0.0015466721961274743, "rewards/margins": 5.211723804473877, "rewards/rejected": -8.690145492553711, "step": 181 }, { "epoch": 0.3899303695768613, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.697644696654893, "learning_rate": 6.27326968973747e-07, "logits/chosen": 0.9457991123199463, "logits/rejected": 0.7241038084030151, "logps/accuracies": 0.5, "logps/chosen": -442.8717041015625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -379.5626525878906, "logps/ref_rejected": -298.47650146484375, "logps/rejected": -431.15509033203125, "loss": 0.2081, "rewards/accuracies": 0.75, "rewards/chosen": -3.165452480316162, "rewards/grad_term": 0.008516497910022736, "rewards/margins": 3.4684762954711914, "rewards/rejected": -6.6339287757873535, "step": 182 }, { "epoch": 0.3920728441349759, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.25, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.8108725640856, "learning_rate": 6.26491646778043e-07, "logits/chosen": 1.0096490383148193, "logits/rejected": 0.8373015522956848, "logps/accuracies": 0.75, "logps/chosen": -351.55303955078125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -287.40643310546875, "logps/ref_rejected": -275.7420654296875, "logps/rejected": -407.6207275390625, "loss": 0.2152, "rewards/accuracies": 0.75, "rewards/chosen": -3.2073283195495605, "rewards/grad_term": 0.011961029842495918, "rewards/margins": 3.3866024017333984, "rewards/rejected": -6.593931198120117, "step": 183 }, { "epoch": 0.3942153186930905, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.474698775172937, "learning_rate": 6.256563245823388e-07, "logits/chosen": 0.6665371060371399, "logits/rejected": 0.6652243137359619, "logps/accuracies": 0.75, "logps/chosen": -505.3188171386719, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -441.2266845703125, "logps/ref_rejected": -416.076904296875, "logps/rejected": -567.68310546875, "loss": 0.2278, "rewards/accuracies": 1.0, "rewards/chosen": -3.204606056213379, "rewards/grad_term": 0.001420854590833187, "rewards/margins": 4.375702857971191, "rewards/rejected": -7.58030891418457, "step": 184 }, { "epoch": 0.3963577932512051, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.943275839760174, "learning_rate": 6.248210023866348e-07, "logits/chosen": 0.9953018426895142, "logits/rejected": 0.795741617679596, "logps/accuracies": 0.5, "logps/chosen": -452.40887451171875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -384.6745910644531, "logps/ref_rejected": -307.6947326660156, "logps/rejected": -431.2263488769531, "loss": 0.2104, "rewards/accuracies": 1.0, "rewards/chosen": -3.386714220046997, "rewards/grad_term": 0.0036700021009892225, "rewards/margins": 2.7898666858673096, "rewards/rejected": -6.176580905914307, "step": 185 }, { "epoch": 0.39850026780931974, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.413011794039116, "learning_rate": 6.239856801909308e-07, "logits/chosen": 0.6716040968894958, "logits/rejected": 0.619194507598877, "logps/accuracies": 0.5, "logps/chosen": -467.7266845703125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -381.2197570800781, "logps/ref_rejected": -290.17254638671875, "logps/rejected": -458.6622314453125, "loss": 0.1671, "rewards/accuracies": 1.0, "rewards/chosen": -4.325344562530518, "rewards/grad_term": 0.004242981784045696, "rewards/margins": 4.0991411209106445, "rewards/rejected": -8.424485206604004, "step": 186 }, { "epoch": 0.4006427423674344, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.119460917704207, "learning_rate": 6.231503579952267e-07, "logits/chosen": 0.8225597143173218, "logits/rejected": 0.6662357449531555, "logps/accuracies": 0.5, "logps/chosen": -469.7325439453125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -364.77740478515625, "logps/ref_rejected": -268.07843017578125, "logps/rejected": -472.31781005859375, "loss": 0.1839, "rewards/accuracies": 1.0, "rewards/chosen": -5.247758388519287, "rewards/grad_term": 0.006101151462644339, "rewards/margins": 4.964210033416748, "rewards/rejected": -10.211968421936035, "step": 187 }, { "epoch": 0.40278521692554903, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 10.046352079716472, "learning_rate": 6.223150357995226e-07, "logits/chosen": 0.8952844142913818, "logits/rejected": 0.6202220916748047, "logps/accuracies": 1.0, "logps/chosen": -472.1369323730469, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -377.0249938964844, "logps/ref_rejected": -345.5789489746094, "logps/rejected": -568.91357421875, "loss": 0.2002, "rewards/accuracies": 1.0, "rewards/chosen": -4.755597114562988, "rewards/grad_term": 0.0011726694647222757, "rewards/margins": 6.411135673522949, "rewards/rejected": -11.166732788085938, "step": 188 }, { "epoch": 0.40492769148366364, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.25, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.37154123816891, "learning_rate": 6.214797136038185e-07, "logits/chosen": 0.8080844879150391, "logits/rejected": 0.843936026096344, "logps/accuracies": 0.25, "logps/chosen": -305.34619140625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -231.69813537597656, "logps/ref_rejected": -215.05422973632812, "logps/rejected": -299.5094299316406, "loss": 0.2138, "rewards/accuracies": 0.75, "rewards/chosen": -3.682403087615967, "rewards/grad_term": 0.020120887085795403, "rewards/margins": 0.5403570532798767, "rewards/rejected": -4.222760200500488, "step": 189 }, { "epoch": 0.40707016604177826, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.174975913775362, "learning_rate": 6.206443914081146e-07, "logits/chosen": 0.7329360246658325, "logits/rejected": 0.8288344144821167, "logps/accuracies": 0.5, "logps/chosen": -461.33984375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -360.3821105957031, "logps/ref_rejected": -286.70745849609375, "logps/rejected": -451.14569091796875, "loss": 0.2014, "rewards/accuracies": 0.75, "rewards/chosen": -5.047886848449707, "rewards/grad_term": 0.013596764765679836, "rewards/margins": 3.1740236282348633, "rewards/rejected": -8.22191047668457, "step": 190 }, { "epoch": 0.4092126405998929, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.272281750209924, "learning_rate": 6.198090692124104e-07, "logits/chosen": 0.8074694275856018, "logits/rejected": 0.7827705144882202, "logps/accuracies": 0.5, "logps/chosen": -419.7196960449219, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -305.22576904296875, "logps/ref_rejected": -257.1751708984375, "logps/rejected": -473.9278564453125, "loss": 0.1775, "rewards/accuracies": 0.75, "rewards/chosen": -5.724696159362793, "rewards/grad_term": 0.008914883248507977, "rewards/margins": 5.112937927246094, "rewards/rejected": -10.837634086608887, "step": 191 }, { "epoch": 0.4113551151580075, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.921800868376472, "learning_rate": 6.189737470167064e-07, "logits/chosen": 0.8224954009056091, "logits/rejected": 0.7576948404312134, "logps/accuracies": 0.5, "logps/chosen": -276.3191833496094, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -228.04379272460938, "logps/ref_rejected": -168.54588317871094, "logps/rejected": -262.92254638671875, "loss": 0.221, "rewards/accuracies": 0.75, "rewards/chosen": -2.4137697219848633, "rewards/grad_term": 0.011381752789020538, "rewards/margins": 2.305063247680664, "rewards/rejected": -4.718832969665527, "step": 192 }, { "epoch": 0.4113551151580075, "eval_flips/correct->correct": 0.1599999964237213, "eval_flips/correct->incorrect": 0.0, "eval_flips/incorrect->correct": 0.3199999928474426, "eval_flips/incorrect->incorrect": 0.5199999809265137, "eval_logits/chosen": 0.8205481767654419, "eval_logits/rejected": 0.7034481763839722, "eval_logps/accuracies": 0.47999998927116394, "eval_logps/chosen": -390.3917541503906, "eval_logps/ref_accuracies": 0.1599999964237213, "eval_logps/ref_chosen": -323.51568603515625, "eval_logps/ref_rejected": -258.70098876953125, "eval_logps/rejected": -389.7598876953125, "eval_loss": 0.23252426087856293, "eval_rewards/accuracies": 0.8600000143051147, "eval_rewards/chosen": -3.3438057899475098, "eval_rewards/grad_term": 0.009308630600571632, "eval_rewards/margins": 3.2091403007507324, "eval_rewards/rejected": -6.5529465675354, "eval_runtime": 375.2407, "eval_samples_per_second": 4.211, "eval_steps_per_second": 0.133, "step": 192 }, { "epoch": 0.4134975897161221, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 12.176142217784117, "learning_rate": 6.181384248210024e-07, "logits/chosen": 0.9473562836647034, "logits/rejected": 0.800918698310852, "logps/accuracies": 1.0, "logps/chosen": -362.5771484375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -297.9696350097656, "logps/ref_rejected": -254.08636474609375, "logps/rejected": -451.26666259765625, "loss": 0.2425, "rewards/accuracies": 1.0, "rewards/chosen": -3.2303762435913086, "rewards/grad_term": 0.0030712243169546127, "rewards/margins": 6.628638744354248, "rewards/rejected": -9.859014511108398, "step": 193 }, { "epoch": 0.4156400642742367, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.97633567390224, "learning_rate": 6.173031026252983e-07, "logits/chosen": 0.9676415324211121, "logits/rejected": 0.9164015054702759, "logps/accuracies": 0.5, "logps/chosen": -323.4644470214844, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -252.95120239257812, "logps/ref_rejected": -255.23983764648438, "logps/rejected": -357.87921142578125, "loss": 0.243, "rewards/accuracies": 1.0, "rewards/chosen": -3.525662422180176, "rewards/grad_term": 0.009022894315421581, "rewards/margins": 1.606306791305542, "rewards/rejected": -5.131969451904297, "step": 194 }, { "epoch": 0.4177825388323514, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.766543352301676, "learning_rate": 6.164677804295942e-07, "logits/chosen": 0.7897940278053284, "logits/rejected": 0.8066399097442627, "logps/accuracies": 0.5, "logps/chosen": -236.746337890625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -197.38284301757812, "logps/ref_rejected": -189.28884887695312, "logps/rejected": -234.21102905273438, "loss": 0.1805, "rewards/accuracies": 0.75, "rewards/chosen": -1.9681750535964966, "rewards/grad_term": 0.022014902904629707, "rewards/margins": 0.2779344618320465, "rewards/rejected": -2.2461094856262207, "step": 195 }, { "epoch": 0.419925013390466, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.405842493569402, "learning_rate": 6.156324582338901e-07, "logits/chosen": 1.0093345642089844, "logits/rejected": 0.8334782719612122, "logps/accuracies": 0.75, "logps/chosen": -445.7866516113281, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -337.7933349609375, "logps/ref_rejected": -296.8775329589844, "logps/rejected": -510.7151794433594, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": -5.399665832519531, "rewards/grad_term": 0.0011895910138264298, "rewards/margins": 5.2922163009643555, "rewards/rejected": -10.69188117980957, "step": 196 }, { "epoch": 0.4220674879485806, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.920133033788117, "learning_rate": 6.147971360381862e-07, "logits/chosen": 0.787909209728241, "logits/rejected": 0.75300532579422, "logps/accuracies": 0.5, "logps/chosen": -440.1323547363281, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -368.47412109375, "logps/ref_rejected": -353.4744873046875, "logps/rejected": -485.1199951171875, "loss": 0.2506, "rewards/accuracies": 0.75, "rewards/chosen": -3.5829124450683594, "rewards/grad_term": 0.012279342859983444, "rewards/margins": 2.9993643760681152, "rewards/rejected": -6.582277297973633, "step": 197 }, { "epoch": 0.42420996250669524, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.419878913528332, "learning_rate": 6.13961813842482e-07, "logits/chosen": 1.1179535388946533, "logits/rejected": 0.9940972924232483, "logps/accuracies": 1.0, "logps/chosen": -321.8353271484375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -260.9544372558594, "logps/ref_rejected": -232.75350952148438, "logps/rejected": -376.96441650390625, "loss": 0.1569, "rewards/accuracies": 1.0, "rewards/chosen": -3.0440444946289062, "rewards/grad_term": 0.007131978403776884, "rewards/margins": 4.166501045227051, "rewards/rejected": -7.210545539855957, "step": 198 }, { "epoch": 0.42635243706480985, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.76694640949059, "learning_rate": 6.13126491646778e-07, "logits/chosen": 0.9783276915550232, "logits/rejected": 0.6347091794013977, "logps/accuracies": 0.5, "logps/chosen": -333.9166564941406, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -295.6625671386719, "logps/ref_rejected": -178.05490112304688, "logps/rejected": -262.8484191894531, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": -1.9127050638198853, "rewards/grad_term": 0.009310072287917137, "rewards/margins": 2.3269693851470947, "rewards/rejected": -4.2396745681762695, "step": 199 }, { "epoch": 0.42849491162292447, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.871363821825847, "learning_rate": 6.122911694510739e-07, "logits/chosen": 0.713058590888977, "logits/rejected": 0.7966564893722534, "logps/accuracies": 1.0, "logps/chosen": -270.65655517578125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -239.4276580810547, "logps/ref_rejected": -272.2807312011719, "logps/rejected": -361.95782470703125, "loss": 0.1946, "rewards/accuracies": 1.0, "rewards/chosen": -1.561445713043213, "rewards/grad_term": 0.006638450548052788, "rewards/margins": 2.9224092960357666, "rewards/rejected": -4.483855247497559, "step": 200 }, { "epoch": 0.4306373861810391, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.189710382812372, "learning_rate": 6.1145584725537e-07, "logits/chosen": 0.8107240200042725, "logits/rejected": 0.6742185950279236, "logps/accuracies": 0.75, "logps/chosen": -461.14404296875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -375.26239013671875, "logps/ref_rejected": -366.937744140625, "logps/rejected": -556.6005859375, "loss": 0.1953, "rewards/accuracies": 1.0, "rewards/chosen": -4.2940826416015625, "rewards/grad_term": 0.002399621531367302, "rewards/margins": 5.189059734344482, "rewards/rejected": -9.483142852783203, "step": 201 }, { "epoch": 0.4327798607391537, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.25, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.981730687281297, "learning_rate": 6.106205250596658e-07, "logits/chosen": 0.8751631379127502, "logits/rejected": 0.8072733879089355, "logps/accuracies": 0.75, "logps/chosen": -323.69873046875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -253.70254516601562, "logps/ref_rejected": -235.48675537109375, "logps/rejected": -390.3155517578125, "loss": 0.2163, "rewards/accuracies": 0.75, "rewards/chosen": -3.4998087882995605, "rewards/grad_term": 0.009065371006727219, "rewards/margins": 4.241631031036377, "rewards/rejected": -7.741440296173096, "step": 202 }, { "epoch": 0.43492233529726837, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.322543837095315, "learning_rate": 6.097852028639618e-07, "logits/chosen": 0.8359104990959167, "logits/rejected": 0.7406368255615234, "logps/accuracies": 1.0, "logps/chosen": -576.8265380859375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -415.6806335449219, "logps/ref_rejected": -388.8533935546875, "logps/rejected": -631.3206787109375, "loss": 0.2296, "rewards/accuracies": 1.0, "rewards/chosen": -8.057294845581055, "rewards/grad_term": 0.004110483452677727, "rewards/margins": 4.066068649291992, "rewards/rejected": -12.123363494873047, "step": 203 }, { "epoch": 0.437064809855383, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.923150092849891, "learning_rate": 6.089498806682577e-07, "logits/chosen": 0.9337953329086304, "logits/rejected": 0.5036557912826538, "logps/accuracies": 0.75, "logps/chosen": -321.8887939453125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -268.6906433105469, "logps/ref_rejected": -248.19271850585938, "logps/rejected": -372.2651062011719, "loss": 0.2665, "rewards/accuracies": 1.0, "rewards/chosen": -2.6599063873291016, "rewards/grad_term": 0.006289066281169653, "rewards/margins": 3.543713092803955, "rewards/rejected": -6.203619480133057, "step": 204 }, { "epoch": 0.4392072844134976, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.602466187696242, "learning_rate": 6.081145584725537e-07, "logits/chosen": 0.799699604511261, "logits/rejected": 0.7706651091575623, "logps/accuracies": 1.0, "logps/chosen": -431.94512939453125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -358.3432312011719, "logps/ref_rejected": -336.4741516113281, "logps/rejected": -577.3936157226562, "loss": 0.2874, "rewards/accuracies": 1.0, "rewards/chosen": -3.6800947189331055, "rewards/grad_term": 0.0002930064802058041, "rewards/margins": 8.365878105163574, "rewards/rejected": -12.04597282409668, "step": 205 }, { "epoch": 0.4413497589716122, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.272614970451562, "learning_rate": 6.072792362768496e-07, "logits/chosen": 0.9661082625389099, "logits/rejected": 0.8308844566345215, "logps/accuracies": 0.75, "logps/chosen": -429.0587158203125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -331.1964111328125, "logps/ref_rejected": -307.45654296875, "logps/rejected": -492.7054138183594, "loss": 0.1984, "rewards/accuracies": 0.75, "rewards/chosen": -4.8931169509887695, "rewards/grad_term": 0.0081776799634099, "rewards/margins": 4.369326114654541, "rewards/rejected": -9.262442588806152, "step": 206 }, { "epoch": 0.44349223352972683, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.721689104729112, "learning_rate": 6.064439140811455e-07, "logits/chosen": 0.9881528615951538, "logits/rejected": 0.8519094586372375, "logps/accuracies": 0.5, "logps/chosen": -221.66915893554688, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -187.01608276367188, "logps/ref_rejected": -149.39364624023438, "logps/rejected": -215.79965209960938, "loss": 0.2711, "rewards/accuracies": 1.0, "rewards/chosen": -1.732654094696045, "rewards/grad_term": 0.008831696584820747, "rewards/margins": 1.587646245956421, "rewards/rejected": -3.320300340652466, "step": 207 }, { "epoch": 0.44563470808784145, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.793737716453558, "learning_rate": 6.056085918854416e-07, "logits/chosen": 1.012654185295105, "logits/rejected": 0.9926575422286987, "logps/accuracies": 0.75, "logps/chosen": -402.0953369140625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -353.88946533203125, "logps/ref_rejected": -340.03839111328125, "logps/rejected": -436.39117431640625, "loss": 0.237, "rewards/accuracies": 1.0, "rewards/chosen": -2.4102942943573, "rewards/grad_term": 0.004451955668628216, "rewards/margins": 2.4073448181152344, "rewards/rejected": -4.817638874053955, "step": 208 }, { "epoch": 0.44777718264595606, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.289495153079532, "learning_rate": 6.047732696897374e-07, "logits/chosen": 0.8617650270462036, "logits/rejected": 0.6589657068252563, "logps/accuracies": 0.5, "logps/chosen": -439.2054443359375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -381.7887268066406, "logps/ref_rejected": -318.25933837890625, "logps/rejected": -461.6387634277344, "loss": 0.171, "rewards/accuracies": 1.0, "rewards/chosen": -2.870835304260254, "rewards/grad_term": 0.004674965050071478, "rewards/margins": 4.2981367111206055, "rewards/rejected": -7.168972015380859, "step": 209 }, { "epoch": 0.4499196572040707, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.537792356133702, "learning_rate": 6.039379474940334e-07, "logits/chosen": 0.9008455276489258, "logits/rejected": 0.6296284198760986, "logps/accuracies": 0.5, "logps/chosen": -330.61273193359375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -269.29754638671875, "logps/ref_rejected": -172.04388427734375, "logps/rejected": -298.81842041015625, "loss": 0.196, "rewards/accuracies": 1.0, "rewards/chosen": -3.065760612487793, "rewards/grad_term": 0.009436404332518578, "rewards/margins": 3.272966146469116, "rewards/rejected": -6.338726997375488, "step": 210 }, { "epoch": 0.45206213176218535, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.509811900422134, "learning_rate": 6.031026252983293e-07, "logits/chosen": 0.9384500980377197, "logits/rejected": 0.7233452200889587, "logps/accuracies": 0.5, "logps/chosen": -262.9141845703125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -203.38661193847656, "logps/ref_rejected": -200.27188110351562, "logps/rejected": -254.28463745117188, "loss": 0.2868, "rewards/accuracies": 0.5, "rewards/chosen": -2.976378917694092, "rewards/grad_term": 0.027324385941028595, "rewards/margins": -0.275741308927536, "rewards/rejected": -2.7006378173828125, "step": 211 }, { "epoch": 0.45420460632029996, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.335767333746587, "learning_rate": 6.022673031026253e-07, "logits/chosen": 0.8327051401138306, "logits/rejected": 0.7020426988601685, "logps/accuracies": 0.75, "logps/chosen": -380.3292236328125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -343.1072998046875, "logps/ref_rejected": -288.2660217285156, "logps/rejected": -439.3631591796875, "loss": 0.1867, "rewards/accuracies": 1.0, "rewards/chosen": -1.8610966205596924, "rewards/grad_term": 0.005783412139862776, "rewards/margins": 5.693758964538574, "rewards/rejected": -7.554856300354004, "step": 212 }, { "epoch": 0.4563470808784146, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.086663858024284, "learning_rate": 6.014319809069212e-07, "logits/chosen": 0.8908300995826721, "logits/rejected": 0.8858309984207153, "logps/accuracies": 0.5, "logps/chosen": -195.6943359375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -161.84103393554688, "logps/ref_rejected": -157.9658966064453, "logps/rejected": -229.5589141845703, "loss": 0.1993, "rewards/accuracies": 1.0, "rewards/chosen": -1.6926652193069458, "rewards/grad_term": 0.008458103984594345, "rewards/margins": 1.8869857788085938, "rewards/rejected": -3.57965087890625, "step": 213 }, { "epoch": 0.4584895554365292, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 7.664284257646828, "learning_rate": 6.005966587112171e-07, "logits/chosen": 0.7465036511421204, "logits/rejected": 0.6328434348106384, "logps/accuracies": 0.25, "logps/chosen": -285.3111267089844, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -251.16213989257812, "logps/ref_rejected": -238.5145721435547, "logps/rejected": -321.26043701171875, "loss": 0.1983, "rewards/accuracies": 1.0, "rewards/chosen": -1.7074486017227173, "rewards/grad_term": 0.007980713620781898, "rewards/margins": 2.4298453330993652, "rewards/rejected": -4.137293815612793, "step": 214 }, { "epoch": 0.4606320299946438, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.440221701148605, "learning_rate": 5.997613365155131e-07, "logits/chosen": 0.44053223729133606, "logits/rejected": 0.45330262184143066, "logps/accuracies": 0.5, "logps/chosen": -552.835205078125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -504.29376220703125, "logps/ref_rejected": -250.1544189453125, "logps/rejected": -491.84893798828125, "loss": 0.1717, "rewards/accuracies": 1.0, "rewards/chosen": -2.4270737171173096, "rewards/grad_term": 0.0030039078556001186, "rewards/margins": 9.657651901245117, "rewards/rejected": -12.084726333618164, "step": 215 }, { "epoch": 0.4627745045527584, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.9857967128079475, "learning_rate": 5.989260143198091e-07, "logits/chosen": 0.7373754978179932, "logits/rejected": 0.7481766939163208, "logps/accuracies": 0.5, "logps/chosen": -352.7110290527344, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -312.5104675292969, "logps/ref_rejected": -294.700439453125, "logps/rejected": -409.80865478515625, "loss": 0.1965, "rewards/accuracies": 0.75, "rewards/chosen": -2.010028600692749, "rewards/grad_term": 0.008961998857557774, "rewards/margins": 3.7453832626342773, "rewards/rejected": -5.7554121017456055, "step": 216 }, { "epoch": 0.46491697911087304, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.188711876243962, "learning_rate": 5.98090692124105e-07, "logits/chosen": 0.9295454025268555, "logits/rejected": 0.6259232759475708, "logps/accuracies": 0.5, "logps/chosen": -415.08636474609375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -358.3822021484375, "logps/ref_rejected": -257.28253173828125, "logps/rejected": -432.2508544921875, "loss": 0.22, "rewards/accuracies": 1.0, "rewards/chosen": -2.8352086544036865, "rewards/grad_term": 0.00315161794424057, "rewards/margins": 5.913206577301025, "rewards/rejected": -8.748414993286133, "step": 217 }, { "epoch": 0.46705945366898766, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.492749863968397, "learning_rate": 5.972553699284009e-07, "logits/chosen": 0.9154873490333557, "logits/rejected": 0.7640275955200195, "logps/accuracies": 0.5, "logps/chosen": -264.8331604003906, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -208.0253448486328, "logps/ref_rejected": -161.46011352539062, "logps/rejected": -261.93133544921875, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": -2.8403897285461426, "rewards/grad_term": 0.009426168166100979, "rewards/margins": 2.183171510696411, "rewards/rejected": -5.023561477661133, "step": 218 }, { "epoch": 0.4692019282271023, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.503071502702135, "learning_rate": 5.96420047732697e-07, "logits/chosen": 0.7595028281211853, "logits/rejected": 0.800428032875061, "logps/accuracies": 1.0, "logps/chosen": -582.7049560546875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -475.7439270019531, "logps/ref_rejected": -485.4010009765625, "logps/rejected": -718.46337890625, "loss": 0.2029, "rewards/accuracies": 1.0, "rewards/chosen": -5.348053932189941, "rewards/grad_term": 0.0018055308610200882, "rewards/margins": 6.3050642013549805, "rewards/rejected": -11.653118133544922, "step": 219 }, { "epoch": 0.47134440278521694, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.843688764755232, "learning_rate": 5.955847255369928e-07, "logits/chosen": 0.773861825466156, "logits/rejected": 0.7292585372924805, "logps/accuracies": 0.75, "logps/chosen": -431.9044494628906, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -366.72967529296875, "logps/ref_rejected": -330.95587158203125, "logps/rejected": -468.05487060546875, "loss": 0.1818, "rewards/accuracies": 1.0, "rewards/chosen": -3.2587406635284424, "rewards/grad_term": 0.006896655540913343, "rewards/margins": 3.59621000289917, "rewards/rejected": -6.854950904846191, "step": 220 }, { "epoch": 0.47348687734333156, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.079186736183265, "learning_rate": 5.947494033412888e-07, "logits/chosen": 0.9851402044296265, "logits/rejected": 0.7796863317489624, "logps/accuracies": 0.75, "logps/chosen": -564.6864624023438, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -508.427734375, "logps/ref_rejected": -418.2183837890625, "logps/rejected": -642.1084594726562, "loss": 0.2627, "rewards/accuracies": 1.0, "rewards/chosen": -2.8129372596740723, "rewards/grad_term": 0.00010136763739865273, "rewards/margins": 8.381568908691406, "rewards/rejected": -11.19450569152832, "step": 221 }, { "epoch": 0.4756293519014462, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 8.596915507884203, "learning_rate": 5.939140811455847e-07, "logits/chosen": 0.9327103495597839, "logits/rejected": 0.7600051164627075, "logps/accuracies": 0.25, "logps/chosen": -458.21246337890625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -412.7664489746094, "logps/ref_rejected": -339.1446533203125, "logps/rejected": -441.3128662109375, "loss": 0.2477, "rewards/accuracies": 1.0, "rewards/chosen": -2.2723002433776855, "rewards/grad_term": 0.0055382088758051395, "rewards/margins": 2.8361098766326904, "rewards/rejected": -5.108409881591797, "step": 222 }, { "epoch": 0.4777718264595608, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.540065404325883, "learning_rate": 5.930787589498806e-07, "logits/chosen": 0.6721053123474121, "logits/rejected": 0.5771878957748413, "logps/accuracies": 0.5, "logps/chosen": -351.98284912109375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -314.6383056640625, "logps/ref_rejected": -232.02334594726562, "logps/rejected": -344.7238464355469, "loss": 0.1544, "rewards/accuracies": 1.0, "rewards/chosen": -1.867226243019104, "rewards/grad_term": 0.00492095947265625, "rewards/margins": 3.7677993774414062, "rewards/rejected": -5.635025978088379, "step": 223 }, { "epoch": 0.4799143010176754, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 6.681718039597231, "learning_rate": 5.922434367541766e-07, "logits/chosen": 0.9633818864822388, "logits/rejected": 0.7339221239089966, "logps/accuracies": 0.25, "logps/chosen": -494.77099609375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -442.48052978515625, "logps/ref_rejected": -331.5393981933594, "logps/rejected": -480.627685546875, "loss": 0.1871, "rewards/accuracies": 1.0, "rewards/chosen": -2.6145222187042236, "rewards/grad_term": 0.000996602582745254, "rewards/margins": 4.839890480041504, "rewards/rejected": -7.454412937164307, "step": 224 }, { "epoch": 0.48205677557579, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 7.005924963769166, "learning_rate": 5.914081145584725e-07, "logits/chosen": 0.8806890845298767, "logits/rejected": 0.6015447974205017, "logps/accuracies": 0.25, "logps/chosen": -366.54046630859375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -308.4888000488281, "logps/ref_rejected": -246.00994873046875, "logps/rejected": -355.2261047363281, "loss": 0.1685, "rewards/accuracies": 1.0, "rewards/chosen": -2.902583599090576, "rewards/grad_term": 0.0076670260168612, "rewards/margins": 2.558225631713867, "rewards/rejected": -5.460808753967285, "step": 225 }, { "epoch": 0.48419925013390464, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.189114548174568, "learning_rate": 5.905727923627685e-07, "logits/chosen": 0.8432016968727112, "logits/rejected": 0.4910334646701813, "logps/accuracies": 0.5, "logps/chosen": -529.419677734375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -399.1881103515625, "logps/ref_rejected": -216.96324157714844, "logps/rejected": -383.4134216308594, "loss": 0.2154, "rewards/accuracies": 0.75, "rewards/chosen": -6.511577606201172, "rewards/grad_term": 0.013775240629911423, "rewards/margins": 1.8109302520751953, "rewards/rejected": -8.32250690460205, "step": 226 }, { "epoch": 0.4863417246920193, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.812465251320808, "learning_rate": 5.897374701670644e-07, "logits/chosen": 0.9737125039100647, "logits/rejected": 0.8655239939689636, "logps/accuracies": 0.75, "logps/chosen": -478.7800598144531, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -391.8121643066406, "logps/ref_rejected": -331.19952392578125, "logps/rejected": -510.9104309082031, "loss": 0.1759, "rewards/accuracies": 1.0, "rewards/chosen": -4.348394393920898, "rewards/grad_term": 0.0018389918841421604, "rewards/margins": 4.637151718139648, "rewards/rejected": -8.985546112060547, "step": 227 }, { "epoch": 0.4884841992501339, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.807007302887406, "learning_rate": 5.889021479713604e-07, "logits/chosen": 0.5967141389846802, "logits/rejected": 0.588777482509613, "logps/accuracies": 0.75, "logps/chosen": -175.18948364257812, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -160.39163208007812, "logps/ref_rejected": -131.2773895263672, "logps/rejected": -182.36181640625, "loss": 0.1968, "rewards/accuracies": 1.0, "rewards/chosen": -0.7398918867111206, "rewards/grad_term": 0.008824177086353302, "rewards/margins": 1.8143287897109985, "rewards/rejected": -2.554220676422119, "step": 228 }, { "epoch": 0.49062667380824854, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.665149636062972, "learning_rate": 5.880668257756563e-07, "logits/chosen": 0.9138537645339966, "logits/rejected": 0.8063441514968872, "logps/accuracies": 0.5, "logps/chosen": -377.0699768066406, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -307.6856384277344, "logps/ref_rejected": -232.5096435546875, "logps/rejected": -368.8970031738281, "loss": 0.1393, "rewards/accuracies": 1.0, "rewards/chosen": -3.4692177772521973, "rewards/grad_term": 0.00434906966984272, "rewards/margins": 3.3501501083374023, "rewards/rejected": -6.819368362426758, "step": 229 }, { "epoch": 0.49276914836636315, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.623147163579497, "learning_rate": 5.872315035799522e-07, "logits/chosen": 0.8319353461265564, "logits/rejected": 0.7092019319534302, "logps/accuracies": 0.5, "logps/chosen": -471.31524658203125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -425.01544189453125, "logps/ref_rejected": -352.10040283203125, "logps/rejected": -474.41741943359375, "loss": 0.1697, "rewards/accuracies": 1.0, "rewards/chosen": -2.3149900436401367, "rewards/grad_term": 0.007428554352372885, "rewards/margins": 3.8008623123168945, "rewards/rejected": -6.115852355957031, "step": 230 }, { "epoch": 0.49491162292447777, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.671116246729667, "learning_rate": 5.863961813842482e-07, "logits/chosen": 0.7218674421310425, "logits/rejected": 0.6530136466026306, "logps/accuracies": 0.75, "logps/chosen": -417.69482421875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -333.3896789550781, "logps/ref_rejected": -285.8385009765625, "logps/rejected": -459.2354431152344, "loss": 0.1788, "rewards/accuracies": 1.0, "rewards/chosen": -4.215256690979004, "rewards/grad_term": 0.004520585294812918, "rewards/margins": 4.454591274261475, "rewards/rejected": -8.66984748840332, "step": 231 }, { "epoch": 0.4970540974825924, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.021075456821866, "learning_rate": 5.855608591885441e-07, "logits/chosen": 0.732083797454834, "logits/rejected": 0.4362190365791321, "logps/accuracies": 0.75, "logps/chosen": -291.83062744140625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -250.48623657226562, "logps/ref_rejected": -179.92550659179688, "logps/rejected": -318.45306396484375, "loss": 0.1443, "rewards/accuracies": 1.0, "rewards/chosen": -2.0672202110290527, "rewards/grad_term": 0.002242325572296977, "rewards/margins": 4.859157562255859, "rewards/rejected": -6.92637825012207, "step": 232 }, { "epoch": 0.499196572040707, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.428848978245107, "learning_rate": 5.847255369928401e-07, "logits/chosen": 0.6398648023605347, "logits/rejected": 0.5878071784973145, "logps/accuracies": 0.75, "logps/chosen": -290.29071044921875, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -244.7794647216797, "logps/ref_rejected": -252.53553771972656, "logps/rejected": -354.963134765625, "loss": 0.2229, "rewards/accuracies": 1.0, "rewards/chosen": -2.275561571121216, "rewards/grad_term": 0.010291634127497673, "rewards/margins": 2.8458194732666016, "rewards/rejected": -5.121380805969238, "step": 233 }, { "epoch": 0.5013390465988217, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 5.246143439813943, "learning_rate": 5.83890214797136e-07, "logits/chosen": 0.9107778072357178, "logits/rejected": 0.7426069378852844, "logps/accuracies": 0.75, "logps/chosen": -474.1260986328125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -367.388671875, "logps/ref_rejected": -272.3711242675781, "logps/rejected": -551.78857421875, "loss": 0.1338, "rewards/accuracies": 1.0, "rewards/chosen": -5.3368730545043945, "rewards/grad_term": 0.0010807998478412628, "rewards/margins": 8.633999824523926, "rewards/rejected": -13.970873832702637, "step": 234 }, { "epoch": 0.5034815211569362, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 15.830907508822516, "learning_rate": 5.83054892601432e-07, "logits/chosen": 0.9917585849761963, "logits/rejected": 0.7569248080253601, "logps/accuracies": 1.0, "logps/chosen": -554.43359375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -480.7333068847656, "logps/ref_rejected": -434.622314453125, "logps/rejected": -575.601318359375, "loss": 0.1535, "rewards/accuracies": 1.0, "rewards/chosen": -3.685014486312866, "rewards/grad_term": 0.002452064771205187, "rewards/margins": 3.3639354705810547, "rewards/rejected": -7.0489501953125, "step": 235 }, { "epoch": 0.5056239957150509, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.346658607688704, "learning_rate": 5.822195704057279e-07, "logits/chosen": 0.623228132724762, "logits/rejected": 0.5134543180465698, "logps/accuracies": 0.75, "logps/chosen": -271.30902099609375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -203.46876525878906, "logps/ref_rejected": -188.93699645996094, "logps/rejected": -337.220947265625, "loss": 0.1585, "rewards/accuracies": 1.0, "rewards/chosen": -3.392012596130371, "rewards/grad_term": 0.009173048660159111, "rewards/margins": 4.022184371948242, "rewards/rejected": -7.4141974449157715, "step": 236 }, { "epoch": 0.5077664702731655, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.626683894288021, "learning_rate": 5.813842482100238e-07, "logits/chosen": 0.8421118855476379, "logits/rejected": 0.7152860760688782, "logps/accuracies": 0.5, "logps/chosen": -363.5376892089844, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -290.9403381347656, "logps/ref_rejected": -304.65911865234375, "logps/rejected": -451.55059814453125, "loss": 0.239, "rewards/accuracies": 1.0, "rewards/chosen": -3.629868268966675, "rewards/grad_term": 0.005975798238068819, "rewards/margins": 3.7147045135498047, "rewards/rejected": -7.344573020935059, "step": 237 }, { "epoch": 0.5099089448312801, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.218899504442438, "learning_rate": 5.805489260143197e-07, "logits/chosen": 0.7464509010314941, "logits/rejected": 0.5703651309013367, "logps/accuracies": 0.75, "logps/chosen": -530.9554443359375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -405.5935363769531, "logps/ref_rejected": -343.15496826171875, "logps/rejected": -560.3563232421875, "loss": 0.2231, "rewards/accuracies": 1.0, "rewards/chosen": -6.268095970153809, "rewards/grad_term": 0.007501318119466305, "rewards/margins": 4.591974258422852, "rewards/rejected": -10.86007022857666, "step": 238 }, { "epoch": 0.5120514193893948, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 5.019653658814029, "learning_rate": 5.797136038186157e-07, "logits/chosen": 0.8765060901641846, "logits/rejected": 0.5701332688331604, "logps/accuracies": 0.5, "logps/chosen": -290.7047424316406, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -251.8502960205078, "logps/ref_rejected": -161.183837890625, "logps/rejected": -263.0386962890625, "loss": 0.1661, "rewards/accuracies": 1.0, "rewards/chosen": -1.942723035812378, "rewards/grad_term": 0.005877365358173847, "rewards/margins": 3.1500186920166016, "rewards/rejected": -5.092741966247559, "step": 239 }, { "epoch": 0.5141938939475094, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 5.853619053843736, "learning_rate": 5.788782816229117e-07, "logits/chosen": 0.6634964346885681, "logits/rejected": 0.6507644653320312, "logps/accuracies": 1.0, "logps/chosen": -361.76287841796875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -260.14697265625, "logps/ref_rejected": -247.04840087890625, "logps/rejected": -400.8096618652344, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": -5.080793857574463, "rewards/grad_term": 0.00737812090665102, "rewards/margins": 2.6072704792022705, "rewards/rejected": -7.688064098358154, "step": 240 }, { "epoch": 0.516336368505624, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 13.513765248794778, "learning_rate": 5.780429594272076e-07, "logits/chosen": 0.8699438571929932, "logits/rejected": 0.7703713774681091, "logps/accuracies": 0.75, "logps/chosen": -379.037353515625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -295.375, "logps/ref_rejected": -275.28857421875, "logps/rejected": -539.343017578125, "loss": 0.2574, "rewards/accuracies": 1.0, "rewards/chosen": -4.1831183433532715, "rewards/grad_term": 0.002768411999568343, "rewards/margins": 9.019603729248047, "rewards/rejected": -13.202722549438477, "step": 241 }, { "epoch": 0.5184788430637386, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.871277063040404, "learning_rate": 5.772076372315036e-07, "logits/chosen": 0.7803428769111633, "logits/rejected": 0.6543869376182556, "logps/accuracies": 0.5, "logps/chosen": -564.8624267578125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -448.4819030761719, "logps/ref_rejected": -379.95556640625, "logps/rejected": -606.3104248046875, "loss": 0.2026, "rewards/accuracies": 0.75, "rewards/chosen": -5.81902551651001, "rewards/grad_term": 0.00902615487575531, "rewards/margins": 5.498717784881592, "rewards/rejected": -11.317742347717285, "step": 242 }, { "epoch": 0.5206213176218533, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.769246851000191, "learning_rate": 5.763723150357995e-07, "logits/chosen": 0.6692153811454773, "logits/rejected": 0.896048367023468, "logps/accuracies": 0.75, "logps/chosen": -439.7333679199219, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -363.28973388671875, "logps/ref_rejected": -574.2049560546875, "logps/rejected": -756.08203125, "loss": 0.155, "rewards/accuracies": 1.0, "rewards/chosen": -3.8221817016601562, "rewards/grad_term": 0.0039497376419603825, "rewards/margins": 5.271674156188965, "rewards/rejected": -9.093855857849121, "step": 243 }, { "epoch": 0.5227637921799678, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.5351054842566185, "learning_rate": 5.755369928400955e-07, "logits/chosen": 0.9571207761764526, "logits/rejected": 0.7909256815910339, "logps/accuracies": 0.75, "logps/chosen": -415.9464111328125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -306.4901428222656, "logps/ref_rejected": -293.0599060058594, "logps/rejected": -455.033935546875, "loss": 0.1491, "rewards/accuracies": 1.0, "rewards/chosen": -5.472814559936523, "rewards/grad_term": 0.005084376782178879, "rewards/margins": 2.625887870788574, "rewards/rejected": -8.098702430725098, "step": 244 }, { "epoch": 0.5249062667380825, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 16.74633297722825, "learning_rate": 5.747016706443913e-07, "logits/chosen": 0.8858977556228638, "logits/rejected": 0.7780598998069763, "logps/accuracies": 0.75, "logps/chosen": -464.2574462890625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -314.8685302734375, "logps/ref_rejected": -267.0308532714844, "logps/rejected": -507.6094055175781, "loss": 0.1805, "rewards/accuracies": 1.0, "rewards/chosen": -7.469447135925293, "rewards/grad_term": 0.007684916723519564, "rewards/margins": 4.559481620788574, "rewards/rejected": -12.028928756713867, "step": 245 }, { "epoch": 0.527048741296197, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.317686067596801, "learning_rate": 5.738663484486874e-07, "logits/chosen": 0.30461055040359497, "logits/rejected": 0.4746954143047333, "logps/accuracies": 0.75, "logps/chosen": -107.72335815429688, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -91.86939239501953, "logps/ref_rejected": -89.95404052734375, "logps/rejected": -137.3914031982422, "loss": 0.22, "rewards/accuracies": 1.0, "rewards/chosen": -0.792698323726654, "rewards/grad_term": 0.012737632729113102, "rewards/margins": 1.5791699886322021, "rewards/rejected": -2.371868371963501, "step": 246 }, { "epoch": 0.5291912158543117, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.560628687601231, "learning_rate": 5.730310262529833e-07, "logits/chosen": 0.7160434722900391, "logits/rejected": 0.5276747941970825, "logps/accuracies": 1.0, "logps/chosen": -328.2812805175781, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -279.67138671875, "logps/ref_rejected": -251.840576171875, "logps/rejected": -399.2860107421875, "loss": 0.1576, "rewards/accuracies": 1.0, "rewards/chosen": -2.4304940700531006, "rewards/grad_term": 0.0015676068142056465, "rewards/margins": 4.941778659820557, "rewards/rejected": -7.372272968292236, "step": 247 }, { "epoch": 0.5313336904124264, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.552825805844636, "learning_rate": 5.721957040572792e-07, "logits/chosen": 0.665590763092041, "logits/rejected": 0.6970337629318237, "logps/accuracies": 1.0, "logps/chosen": -445.61358642578125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -375.84332275390625, "logps/ref_rejected": -374.01043701171875, "logps/rejected": -503.5439453125, "loss": 0.1957, "rewards/accuracies": 1.0, "rewards/chosen": -3.4885122776031494, "rewards/grad_term": 0.005766700953245163, "rewards/margins": 2.9881629943847656, "rewards/rejected": -6.476675510406494, "step": 248 }, { "epoch": 0.533476164970541, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 6.35131355845974, "learning_rate": 5.713603818615751e-07, "logits/chosen": 0.8978402614593506, "logits/rejected": 0.5600339770317078, "logps/accuracies": 0.25, "logps/chosen": -461.3507385253906, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -393.03179931640625, "logps/ref_rejected": -272.4175109863281, "logps/rejected": -435.42462158203125, "loss": 0.1457, "rewards/accuracies": 1.0, "rewards/chosen": -3.415947675704956, "rewards/grad_term": 0.004269158001989126, "rewards/margins": 4.734410285949707, "rewards/rejected": -8.150358200073242, "step": 249 }, { "epoch": 0.5356186395286556, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 17.580885514236424, "learning_rate": 5.705250596658711e-07, "logits/chosen": 0.6814495921134949, "logits/rejected": 0.75849449634552, "logps/accuracies": 0.75, "logps/chosen": -353.9872131347656, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -253.66598510742188, "logps/ref_rejected": -231.22552490234375, "logps/rejected": -383.4581298828125, "loss": 0.2176, "rewards/accuracies": 1.0, "rewards/chosen": -5.016061305999756, "rewards/grad_term": 0.007379300892353058, "rewards/margins": 2.595566987991333, "rewards/rejected": -7.611629009246826, "step": 250 }, { "epoch": 0.5377611140867702, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.1971499013479, "learning_rate": 5.696897374701671e-07, "logits/chosen": 0.6412093639373779, "logits/rejected": 0.6849941611289978, "logps/accuracies": 0.75, "logps/chosen": -354.33367919921875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -277.5402526855469, "logps/ref_rejected": -235.74111938476562, "logps/rejected": -374.9162902832031, "loss": 0.2137, "rewards/accuracies": 0.75, "rewards/chosen": -3.839670181274414, "rewards/grad_term": 0.01258145458996296, "rewards/margins": 3.119089126586914, "rewards/rejected": -6.958759307861328, "step": 251 }, { "epoch": 0.5399035886448849, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.69469276334864, "learning_rate": 5.68854415274463e-07, "logits/chosen": 0.8622183799743652, "logits/rejected": 0.5919508337974548, "logps/accuracies": 0.5, "logps/chosen": -274.1978759765625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -238.06967163085938, "logps/ref_rejected": -164.38018798828125, "logps/rejected": -275.35992431640625, "loss": 0.2049, "rewards/accuracies": 0.75, "rewards/chosen": -1.806409478187561, "rewards/grad_term": 0.010785759426653385, "rewards/margins": 3.742577075958252, "rewards/rejected": -5.548986434936523, "step": 252 }, { "epoch": 0.5420460632029994, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.021225410718323, "learning_rate": 5.680190930787589e-07, "logits/chosen": 0.7470685243606567, "logits/rejected": 0.6984888911247253, "logps/accuracies": 0.75, "logps/chosen": -221.68431091308594, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -187.86117553710938, "logps/ref_rejected": -150.86964416503906, "logps/rejected": -256.9394836425781, "loss": 0.1673, "rewards/accuracies": 1.0, "rewards/chosen": -1.6911565065383911, "rewards/grad_term": 0.011116426438093185, "rewards/margins": 3.612335205078125, "rewards/rejected": -5.303491592407227, "step": 253 }, { "epoch": 0.5441885377611141, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.151951865155123, "learning_rate": 5.671837708830549e-07, "logits/chosen": 0.22945332527160645, "logits/rejected": 0.5243977308273315, "logps/accuracies": 0.5, "logps/chosen": -282.75384521484375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -227.72982788085938, "logps/ref_rejected": -320.92535400390625, "logps/rejected": -430.75787353515625, "loss": 0.1473, "rewards/accuracies": 0.75, "rewards/chosen": -2.7512013912200928, "rewards/grad_term": 0.013240108266472816, "rewards/margins": 2.740424394607544, "rewards/rejected": -5.491625785827637, "step": 254 }, { "epoch": 0.5463310123192288, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 11.292561948011548, "learning_rate": 5.663484486873508e-07, "logits/chosen": 0.7049826979637146, "logits/rejected": 0.7030065059661865, "logps/accuracies": 1.0, "logps/chosen": -465.6855773925781, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -391.0091552734375, "logps/ref_rejected": -373.00775146484375, "logps/rejected": -555.9122314453125, "loss": 0.1945, "rewards/accuracies": 1.0, "rewards/chosen": -3.733822822570801, "rewards/grad_term": 0.001258535892702639, "rewards/margins": 5.411401271820068, "rewards/rejected": -9.145223617553711, "step": 255 }, { "epoch": 0.5484734868773433, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.25, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 12.415742091600887, "learning_rate": 5.655131264916467e-07, "logits/chosen": 0.9153692722320557, "logits/rejected": 0.5475519895553589, "logps/accuracies": 0.5, "logps/chosen": -304.13330078125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -245.09902954101562, "logps/ref_rejected": -193.71463012695312, "logps/rejected": -393.2826843261719, "loss": 0.2106, "rewards/accuracies": 0.75, "rewards/chosen": -2.951713800430298, "rewards/grad_term": 0.011270977556705475, "rewards/margins": 7.026688575744629, "rewards/rejected": -9.978402137756348, "step": 256 }, { "epoch": 0.5484734868773433, "eval_flips/correct->correct": 0.14000000059604645, "eval_flips/correct->incorrect": 0.019999999552965164, "eval_flips/incorrect->correct": 0.4399999976158142, "eval_flips/incorrect->incorrect": 0.4000000059604645, "eval_logits/chosen": 0.7667725086212158, "eval_logits/rejected": 0.6504298448562622, "eval_logps/accuracies": 0.5799999833106995, "eval_logps/chosen": -395.9922790527344, "eval_logps/ref_accuracies": 0.1599999964237213, "eval_logps/ref_chosen": -323.51568603515625, "eval_logps/ref_rejected": -258.70098876953125, "eval_logps/rejected": -407.676025390625, "eval_loss": 0.19521716237068176, "eval_rewards/accuracies": 0.8399999737739563, "eval_rewards/chosen": -3.6238298416137695, "eval_rewards/grad_term": 0.008681231178343296, "eval_rewards/margins": 3.824923038482666, "eval_rewards/rejected": -7.4487528800964355, "eval_runtime": 372.955, "eval_samples_per_second": 4.236, "eval_steps_per_second": 0.134, "step": 256 }, { "epoch": 0.550615961435458, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.31965267361933, "learning_rate": 5.646778042959426e-07, "logits/chosen": 0.9101255536079407, "logits/rejected": 0.8786407113075256, "logps/accuracies": 0.75, "logps/chosen": -516.9441528320312, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -426.1374206542969, "logps/ref_rejected": -390.5966796875, "logps/rejected": -595.52685546875, "loss": 0.1809, "rewards/accuracies": 1.0, "rewards/chosen": -4.540337085723877, "rewards/grad_term": 0.0009564714273437858, "rewards/margins": 5.706172466278076, "rewards/rejected": -10.246509552001953, "step": 257 }, { "epoch": 0.5527584359935725, "flips/correct->correct": 1.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.682059725515248, "learning_rate": 5.638424821002387e-07, "logits/chosen": 0.8498424291610718, "logits/rejected": 0.8489320874214172, "logps/accuracies": 1.0, "logps/chosen": -413.9257507324219, "logps/ref_accuracies": 1.0, "logps/ref_chosen": -327.97869873046875, "logps/ref_rejected": -369.13482666015625, "logps/rejected": -582.4822998046875, "loss": 0.169, "rewards/accuracies": 1.0, "rewards/chosen": -4.2973527908325195, "rewards/grad_term": 0.005528903566300869, "rewards/margins": 6.370021820068359, "rewards/rejected": -10.667373657226562, "step": 258 }, { "epoch": 0.5549009105516872, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.707978512883248, "learning_rate": 5.630071599045346e-07, "logits/chosen": 0.7243056297302246, "logits/rejected": 0.6144933104515076, "logps/accuracies": 0.75, "logps/chosen": -426.29638671875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -336.075439453125, "logps/ref_rejected": -338.01348876953125, "logps/rejected": -613.092041015625, "loss": 0.1699, "rewards/accuracies": 1.0, "rewards/chosen": -4.511048316955566, "rewards/grad_term": 0.00518822530284524, "rewards/margins": 9.242880821228027, "rewards/rejected": -13.753929138183594, "step": 259 }, { "epoch": 0.5570433851098018, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.457819309849194, "learning_rate": 5.621718377088305e-07, "logits/chosen": 0.7747801542282104, "logits/rejected": 0.6980942487716675, "logps/accuracies": 1.0, "logps/chosen": -333.2770080566406, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -263.5532531738281, "logps/ref_rejected": -270.31927490234375, "logps/rejected": -400.2752990722656, "loss": 0.1812, "rewards/accuracies": 0.75, "rewards/chosen": -3.486187696456909, "rewards/grad_term": 0.014478763565421104, "rewards/margins": 3.0116138458251953, "rewards/rejected": -6.497801780700684, "step": 260 }, { "epoch": 0.5591858596679165, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 12.791539513426143, "learning_rate": 5.613365155131265e-07, "logits/chosen": 1.010023832321167, "logits/rejected": 0.8089584708213806, "logps/accuracies": 0.5, "logps/chosen": -434.7796325683594, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -369.63201904296875, "logps/ref_rejected": -312.5323791503906, "logps/rejected": -433.6856994628906, "loss": 0.2167, "rewards/accuracies": 0.75, "rewards/chosen": -3.2573814392089844, "rewards/grad_term": 0.011615730822086334, "rewards/margins": 2.800283908843994, "rewards/rejected": -6.057665824890137, "step": 261 }, { "epoch": 0.561328334226031, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 5.761871953548487, "learning_rate": 5.605011933174224e-07, "logits/chosen": 0.7621825337409973, "logits/rejected": 0.6671872138977051, "logps/accuracies": 1.0, "logps/chosen": -520.250732421875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -421.12890625, "logps/ref_rejected": -388.18841552734375, "logps/rejected": -616.8232421875, "loss": 0.1195, "rewards/accuracies": 1.0, "rewards/chosen": -4.956088066101074, "rewards/grad_term": 0.0011122592259198427, "rewards/margins": 6.475651741027832, "rewards/rejected": -11.431740760803223, "step": 262 }, { "epoch": 0.5634708087841457, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.615576943912552, "learning_rate": 5.596658711217183e-07, "logits/chosen": 0.7528675198554993, "logits/rejected": 0.5586297512054443, "logps/accuracies": 1.0, "logps/chosen": -235.1770477294922, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -193.3452606201172, "logps/ref_rejected": -156.8057861328125, "logps/rejected": -284.637939453125, "loss": 0.1786, "rewards/accuracies": 0.75, "rewards/chosen": -2.0915887355804443, "rewards/grad_term": 0.012236223556101322, "rewards/margins": 4.300019264221191, "rewards/rejected": -6.391608238220215, "step": 263 }, { "epoch": 0.5656132833422604, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 5.642389085683032, "learning_rate": 5.588305489260142e-07, "logits/chosen": 0.7206822633743286, "logits/rejected": 0.6255587339401245, "logps/accuracies": 1.0, "logps/chosen": -403.0487976074219, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -323.03924560546875, "logps/ref_rejected": -263.4703369140625, "logps/rejected": -451.4239196777344, "loss": 0.1584, "rewards/accuracies": 0.75, "rewards/chosen": -4.000478267669678, "rewards/grad_term": 0.009540688246488571, "rewards/margins": 5.3972015380859375, "rewards/rejected": -9.397679328918457, "step": 264 }, { "epoch": 0.5677557579003749, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 14.507972697356939, "learning_rate": 5.579952267303103e-07, "logits/chosen": 0.8435995578765869, "logits/rejected": 0.30664098262786865, "logps/accuracies": 0.25, "logps/chosen": -513.014404296875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -417.4500427246094, "logps/ref_rejected": -269.4376220703125, "logps/rejected": -427.7491760253906, "loss": 0.2097, "rewards/accuracies": 0.75, "rewards/chosen": -4.7782206535339355, "rewards/grad_term": 0.010889217257499695, "rewards/margins": 3.137356996536255, "rewards/rejected": -7.9155778884887695, "step": 265 }, { "epoch": 0.5698982324584896, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.002273528640038, "learning_rate": 5.571599045346062e-07, "logits/chosen": 0.6676109433174133, "logits/rejected": 0.7191418409347534, "logps/accuracies": 0.75, "logps/chosen": -496.98822021484375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -340.63250732421875, "logps/ref_rejected": -327.24847412109375, "logps/rejected": -582.9611206054688, "loss": 0.1535, "rewards/accuracies": 0.75, "rewards/chosen": -7.817786693572998, "rewards/grad_term": 0.011708484031260014, "rewards/margins": 4.9678449630737305, "rewards/rejected": -12.785632133483887, "step": 266 }, { "epoch": 0.5720407070166041, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 5.543702357825273, "learning_rate": 5.563245823389021e-07, "logits/chosen": 0.8063835501670837, "logits/rejected": 0.6988131999969482, "logps/accuracies": 0.75, "logps/chosen": -403.5315856933594, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -330.3784484863281, "logps/ref_rejected": -266.2269287109375, "logps/rejected": -429.4187316894531, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": -3.6576569080352783, "rewards/grad_term": 0.006236384157091379, "rewards/margins": 4.501932621002197, "rewards/rejected": -8.159589767456055, "step": 267 }, { "epoch": 0.5741831815747188, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.10211268941143, "learning_rate": 5.55489260143198e-07, "logits/chosen": 0.6175810694694519, "logits/rejected": 0.4528239965438843, "logps/accuracies": 0.75, "logps/chosen": -391.7451477050781, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -323.01617431640625, "logps/ref_rejected": -281.732421875, "logps/rejected": -443.7324523925781, "loss": 0.1603, "rewards/accuracies": 1.0, "rewards/chosen": -3.4364490509033203, "rewards/grad_term": 0.003603234887123108, "rewards/margins": 4.663552284240723, "rewards/rejected": -8.100001335144043, "step": 268 }, { "epoch": 0.5763256561328334, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.454583306377835, "learning_rate": 5.546539379474941e-07, "logits/chosen": 0.5018086433410645, "logits/rejected": 0.3208431601524353, "logps/accuracies": 0.75, "logps/chosen": -292.32537841796875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -235.02024841308594, "logps/ref_rejected": -239.4210968017578, "logps/rejected": -393.904541015625, "loss": 0.1477, "rewards/accuracies": 1.0, "rewards/chosen": -2.8652560710906982, "rewards/grad_term": 0.005141068249940872, "rewards/margins": 4.858916282653809, "rewards/rejected": -7.724172115325928, "step": 269 }, { "epoch": 0.578468130690948, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.116562665988194, "learning_rate": 5.5381861575179e-07, "logits/chosen": 0.8009479641914368, "logits/rejected": 0.5304053425788879, "logps/accuracies": 0.75, "logps/chosen": -567.357666015625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -447.6595458984375, "logps/ref_rejected": -327.67388916015625, "logps/rejected": -578.609619140625, "loss": 0.181, "rewards/accuracies": 1.0, "rewards/chosen": -5.984908580780029, "rewards/grad_term": 0.001012351829558611, "rewards/margins": 6.561877727508545, "rewards/rejected": -12.546786308288574, "step": 270 }, { "epoch": 0.5806106052490627, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.286921940219483, "learning_rate": 5.529832935560859e-07, "logits/chosen": 0.8755187392234802, "logits/rejected": 0.7794501781463623, "logps/accuracies": 0.75, "logps/chosen": -271.8689880371094, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -215.5362548828125, "logps/ref_rejected": -208.42568969726562, "logps/rejected": -326.4412536621094, "loss": 0.1667, "rewards/accuracies": 1.0, "rewards/chosen": -2.8166375160217285, "rewards/grad_term": 0.006757371127605438, "rewards/margins": 3.084141254425049, "rewards/rejected": -5.900778770446777, "step": 271 }, { "epoch": 0.5827530798071773, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.808675787712881, "learning_rate": 5.521479713603818e-07, "logits/chosen": 0.5810420513153076, "logits/rejected": 0.5697520971298218, "logps/accuracies": 0.75, "logps/chosen": -252.84695434570312, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -222.0087890625, "logps/ref_rejected": -220.40602111816406, "logps/rejected": -321.25408935546875, "loss": 0.1784, "rewards/accuracies": 1.0, "rewards/chosen": -1.5419079065322876, "rewards/grad_term": 0.006449728738516569, "rewards/margins": 3.500495433807373, "rewards/rejected": -5.042403697967529, "step": 272 }, { "epoch": 0.584895554365292, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.461051225414454, "learning_rate": 5.513126491646778e-07, "logits/chosen": 0.6986079812049866, "logits/rejected": 0.7241477370262146, "logps/accuracies": 1.0, "logps/chosen": -478.98333740234375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -346.74542236328125, "logps/ref_rejected": -319.70550537109375, "logps/rejected": -641.8858642578125, "loss": 0.1217, "rewards/accuracies": 1.0, "rewards/chosen": -6.61189603805542, "rewards/grad_term": 0.004527165554463863, "rewards/margins": 9.497122764587402, "rewards/rejected": -16.109020233154297, "step": 273 }, { "epoch": 0.5870380289234065, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.28604619396496, "learning_rate": 5.504773269689737e-07, "logits/chosen": 0.9160727262496948, "logits/rejected": 0.6226189732551575, "logps/accuracies": 0.75, "logps/chosen": -482.9158630371094, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -380.3866882324219, "logps/ref_rejected": -318.1195068359375, "logps/rejected": -563.2010498046875, "loss": 0.1824, "rewards/accuracies": 1.0, "rewards/chosen": -5.126457691192627, "rewards/grad_term": 0.0009991895640268922, "rewards/margins": 7.12761926651001, "rewards/rejected": -12.254076957702637, "step": 274 }, { "epoch": 0.5891805034815212, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.25, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.96745166065974, "learning_rate": 5.496420047732696e-07, "logits/chosen": 0.7636332511901855, "logits/rejected": 0.7924087643623352, "logps/accuracies": 0.75, "logps/chosen": -232.80300903320312, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -201.30563354492188, "logps/ref_rejected": -200.0108184814453, "logps/rejected": -281.12127685546875, "loss": 0.19, "rewards/accuracies": 0.75, "rewards/chosen": -1.5748703479766846, "rewards/grad_term": 0.011370973661541939, "rewards/margins": 2.480652332305908, "rewards/rejected": -4.055522918701172, "step": 275 }, { "epoch": 0.5913229780396357, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 10.556600598116974, "learning_rate": 5.488066825775657e-07, "logits/chosen": 0.3400154113769531, "logits/rejected": 0.8462868332862854, "logps/accuracies": 1.0, "logps/chosen": -457.0469970703125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -380.0611877441406, "logps/ref_rejected": -327.03466796875, "logps/rejected": -536.0282592773438, "loss": 0.171, "rewards/accuracies": 1.0, "rewards/chosen": -3.8492913246154785, "rewards/grad_term": 0.0008406995330005884, "rewards/margins": 6.6003899574279785, "rewards/rejected": -10.449681282043457, "step": 276 }, { "epoch": 0.5934654525977504, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.78258861657989, "learning_rate": 5.479713603818616e-07, "logits/chosen": 0.5925794839859009, "logits/rejected": 0.3707428276538849, "logps/accuracies": 0.5, "logps/chosen": -392.9906921386719, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -328.11785888671875, "logps/ref_rejected": -267.7380676269531, "logps/rejected": -379.7018737792969, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": -3.243643283843994, "rewards/grad_term": 0.007178822532296181, "rewards/margins": 2.354548215866089, "rewards/rejected": -5.598191261291504, "step": 277 }, { "epoch": 0.595607927155865, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.592007769382429, "learning_rate": 5.471360381861575e-07, "logits/chosen": 0.6001948714256287, "logits/rejected": 0.4797150790691376, "logps/accuracies": 0.75, "logps/chosen": -329.23004150390625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -272.231201171875, "logps/ref_rejected": -260.5545349121094, "logps/rejected": -435.8937072753906, "loss": 0.201, "rewards/accuracies": 1.0, "rewards/chosen": -2.8499412536621094, "rewards/grad_term": 0.005573004484176636, "rewards/margins": 5.917016506195068, "rewards/rejected": -8.766958236694336, "step": 278 }, { "epoch": 0.5977504017139796, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.70464881667517, "learning_rate": 5.463007159904534e-07, "logits/chosen": 0.9618555307388306, "logits/rejected": 0.7959021329879761, "logps/accuracies": 0.75, "logps/chosen": -419.58148193359375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -360.5795593261719, "logps/ref_rejected": -305.4864196777344, "logps/rejected": -494.42156982421875, "loss": 0.1709, "rewards/accuracies": 1.0, "rewards/chosen": -2.9500961303710938, "rewards/grad_term": 0.002942422404885292, "rewards/margins": 6.496662616729736, "rewards/rejected": -9.446758270263672, "step": 279 }, { "epoch": 0.5998928762720943, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 13.531014928298436, "learning_rate": 5.454653937947494e-07, "logits/chosen": 0.5968809127807617, "logits/rejected": 0.6607197523117065, "logps/accuracies": 0.75, "logps/chosen": -369.2514343261719, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -323.9585266113281, "logps/ref_rejected": -320.01959228515625, "logps/rejected": -448.68682861328125, "loss": 0.2235, "rewards/accuracies": 1.0, "rewards/chosen": -2.2646453380584717, "rewards/grad_term": 0.004641966428607702, "rewards/margins": 4.168717861175537, "rewards/rejected": -6.433363437652588, "step": 280 }, { "epoch": 0.6020353508302089, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.57138907693324, "learning_rate": 5.446300715990454e-07, "logits/chosen": 1.1103699207305908, "logits/rejected": 0.9430161714553833, "logps/accuracies": 0.5, "logps/chosen": -314.4698486328125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -291.67144775390625, "logps/ref_rejected": -255.3843994140625, "logps/rejected": -313.588134765625, "loss": 0.1767, "rewards/accuracies": 1.0, "rewards/chosen": -1.139919638633728, "rewards/grad_term": 0.011935700662434101, "rewards/margins": 1.7702679634094238, "rewards/rejected": -2.9101874828338623, "step": 281 }, { "epoch": 0.6041778253883235, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.587776679123657, "learning_rate": 5.437947494033412e-07, "logits/chosen": 0.8917209506034851, "logits/rejected": 0.7313340306282043, "logps/accuracies": 0.5, "logps/chosen": -615.8944702148438, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -534.5302734375, "logps/ref_rejected": -437.1783447265625, "logps/rejected": -592.5615844726562, "loss": 0.2, "rewards/accuracies": 1.0, "rewards/chosen": -4.068208694458008, "rewards/grad_term": 0.002287252340465784, "rewards/margins": 3.7009527683258057, "rewards/rejected": -7.769161701202393, "step": 282 }, { "epoch": 0.6063202999464381, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.603440187653598, "learning_rate": 5.429594272076372e-07, "logits/chosen": 0.8125787377357483, "logits/rejected": 0.5284969806671143, "logps/accuracies": 0.75, "logps/chosen": -394.273681640625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -301.3934326171875, "logps/ref_rejected": -264.20989990234375, "logps/rejected": -396.23590087890625, "loss": 0.1553, "rewards/accuracies": 1.0, "rewards/chosen": -4.644012451171875, "rewards/grad_term": 0.012798898853361607, "rewards/margins": 1.9572882652282715, "rewards/rejected": -6.601301193237305, "step": 283 }, { "epoch": 0.6084627745045528, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.706538315869954, "learning_rate": 5.421241050119332e-07, "logits/chosen": 0.6700544357299805, "logits/rejected": 0.6845322847366333, "logps/accuracies": 1.0, "logps/chosen": -373.3970031738281, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -308.53228759765625, "logps/ref_rejected": -321.16717529296875, "logps/rejected": -476.41796875, "loss": 0.2091, "rewards/accuracies": 1.0, "rewards/chosen": -3.2432353496551514, "rewards/grad_term": 0.002791226841509342, "rewards/margins": 4.5193047523498535, "rewards/rejected": -7.762540340423584, "step": 284 }, { "epoch": 0.6106052490626673, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.616290838954344, "learning_rate": 5.412887828162291e-07, "logits/chosen": 0.8290910720825195, "logits/rejected": 0.931686520576477, "logps/accuracies": 1.0, "logps/chosen": -446.44232177734375, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -383.1173400878906, "logps/ref_rejected": -422.9083557128906, "logps/rejected": -596.1305541992188, "loss": 0.1758, "rewards/accuracies": 1.0, "rewards/chosen": -3.1662492752075195, "rewards/grad_term": 0.005467691924422979, "rewards/margins": 5.49485969543457, "rewards/rejected": -8.66110897064209, "step": 285 }, { "epoch": 0.612747723620782, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 11.608396954924636, "learning_rate": 5.40453460620525e-07, "logits/chosen": 0.45172828435897827, "logits/rejected": 0.7064520120620728, "logps/accuracies": 1.0, "logps/chosen": -322.91387939453125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -258.16265869140625, "logps/ref_rejected": -297.7436218261719, "logps/rejected": -475.2723388671875, "loss": 0.1969, "rewards/accuracies": 1.0, "rewards/chosen": -3.237560272216797, "rewards/grad_term": 0.005947392899543047, "rewards/margins": 5.638874530792236, "rewards/rejected": -8.876434326171875, "step": 286 }, { "epoch": 0.6148901981788967, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.779231877934484, "learning_rate": 5.39618138424821e-07, "logits/chosen": 0.9248701333999634, "logits/rejected": 0.8036876916885376, "logps/accuracies": 0.5, "logps/chosen": -467.22015380859375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -380.78204345703125, "logps/ref_rejected": -339.566162109375, "logps/rejected": -530.2546997070312, "loss": 0.2517, "rewards/accuracies": 1.0, "rewards/chosen": -4.321907043457031, "rewards/grad_term": 0.002679330063983798, "rewards/margins": 5.212520599365234, "rewards/rejected": -9.534428596496582, "step": 287 }, { "epoch": 0.6170326727370112, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.667742293114383, "learning_rate": 5.38782816229117e-07, "logits/chosen": 0.5336281061172485, "logits/rejected": 0.5964027643203735, "logps/accuracies": 1.0, "logps/chosen": -413.2911376953125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -360.906982421875, "logps/ref_rejected": -325.00140380859375, "logps/rejected": -499.9887390136719, "loss": 0.2106, "rewards/accuracies": 0.75, "rewards/chosen": -2.6192078590393066, "rewards/grad_term": 0.007026966195553541, "rewards/margins": 6.1301589012146, "rewards/rejected": -8.749366760253906, "step": 288 }, { "epoch": 0.6191751472951259, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 11.794004557760429, "learning_rate": 5.379474940334129e-07, "logits/chosen": 0.5764543414115906, "logits/rejected": 0.49163103103637695, "logps/accuracies": 1.0, "logps/chosen": -346.34478759765625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -279.2290344238281, "logps/ref_rejected": -275.8312072753906, "logps/rejected": -466.09326171875, "loss": 0.1684, "rewards/accuracies": 1.0, "rewards/chosen": -3.3557891845703125, "rewards/grad_term": 0.00030989584047347307, "rewards/margins": 6.157315254211426, "rewards/rejected": -9.513103485107422, "step": 289 }, { "epoch": 0.6213176218532405, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.147575761036771, "learning_rate": 5.371121718377088e-07, "logits/chosen": 0.9949532747268677, "logits/rejected": 0.8375188708305359, "logps/accuracies": 0.75, "logps/chosen": -450.9200134277344, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -368.46990966796875, "logps/ref_rejected": -288.4126281738281, "logps/rejected": -482.5291748046875, "loss": 0.1457, "rewards/accuracies": 1.0, "rewards/chosen": -4.122506141662598, "rewards/grad_term": 0.0010359850712120533, "rewards/margins": 5.583320140838623, "rewards/rejected": -9.705825805664062, "step": 290 }, { "epoch": 0.6234600964113551, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.480445656722806, "learning_rate": 5.362768496420047e-07, "logits/chosen": 0.46533939242362976, "logits/rejected": 0.4300745725631714, "logps/accuracies": 1.0, "logps/chosen": -365.7827453613281, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -291.9012145996094, "logps/ref_rejected": -255.63128662109375, "logps/rejected": -422.51983642578125, "loss": 0.1266, "rewards/accuracies": 1.0, "rewards/chosen": -3.6940770149230957, "rewards/grad_term": 0.004254742059856653, "rewards/margins": 4.650350093841553, "rewards/rejected": -8.344427108764648, "step": 291 }, { "epoch": 0.6256025709694697, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.156244989772967, "learning_rate": 5.354415274463007e-07, "logits/chosen": 0.9660448431968689, "logits/rejected": 0.5434409379959106, "logps/accuracies": 0.75, "logps/chosen": -476.6251220703125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -414.31597900390625, "logps/ref_rejected": -342.433837890625, "logps/rejected": -571.8237915039062, "loss": 0.1555, "rewards/accuracies": 0.75, "rewards/chosen": -3.115457534790039, "rewards/grad_term": 0.006694721523672342, "rewards/margins": 8.354040145874023, "rewards/rejected": -11.469497680664062, "step": 292 }, { "epoch": 0.6277450455275844, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 9.41012930540287, "learning_rate": 5.346062052505966e-07, "logits/chosen": 0.9397240877151489, "logits/rejected": 0.7151464223861694, "logps/accuracies": 0.5, "logps/chosen": -548.965576171875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -421.5787353515625, "logps/ref_rejected": -343.83221435546875, "logps/rejected": -598.891357421875, "loss": 0.1439, "rewards/accuracies": 1.0, "rewards/chosen": -6.369344234466553, "rewards/grad_term": 0.0019262685673311353, "rewards/margins": 6.383614540100098, "rewards/rejected": -12.752958297729492, "step": 293 }, { "epoch": 0.629887520085699, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 5.624072315857759, "learning_rate": 5.337708830548926e-07, "logits/chosen": 0.45759594440460205, "logits/rejected": 0.47171294689178467, "logps/accuracies": 0.5, "logps/chosen": -386.42669677734375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -328.4192199707031, "logps/ref_rejected": -252.86329650878906, "logps/rejected": -425.3404541015625, "loss": 0.1063, "rewards/accuracies": 1.0, "rewards/chosen": -2.9003729820251465, "rewards/grad_term": 0.0017550851916894317, "rewards/margins": 5.723484039306641, "rewards/rejected": -8.623857498168945, "step": 294 }, { "epoch": 0.6320299946438136, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.713346436229937, "learning_rate": 5.329355608591886e-07, "logits/chosen": 0.7498375177383423, "logits/rejected": 0.6682128310203552, "logps/accuracies": 0.75, "logps/chosen": -446.07830810546875, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -382.056884765625, "logps/ref_rejected": -337.1555480957031, "logps/rejected": -528.916015625, "loss": 0.158, "rewards/accuracies": 0.75, "rewards/chosen": -3.201070785522461, "rewards/grad_term": 0.006979628466069698, "rewards/margins": 6.386953830718994, "rewards/rejected": -9.588025093078613, "step": 295 }, { "epoch": 0.6341724692019283, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.364585833613935, "learning_rate": 5.321002386634845e-07, "logits/chosen": 0.7363272905349731, "logits/rejected": 0.6899486780166626, "logps/accuracies": 0.75, "logps/chosen": -605.6181030273438, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -478.4359130859375, "logps/ref_rejected": -392.2603759765625, "logps/rejected": -669.0337524414062, "loss": 0.176, "rewards/accuracies": 1.0, "rewards/chosen": -6.359109401702881, "rewards/grad_term": 0.0019621604587882757, "rewards/margins": 7.479557991027832, "rewards/rejected": -13.838666915893555, "step": 296 }, { "epoch": 0.6363149437600428, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.505205942394765, "learning_rate": 5.312649164677804e-07, "logits/chosen": 0.5754671096801758, "logits/rejected": 0.594577431678772, "logps/accuracies": 0.75, "logps/chosen": -347.17852783203125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -265.71417236328125, "logps/ref_rejected": -234.78533935546875, "logps/rejected": -441.4500732421875, "loss": 0.1279, "rewards/accuracies": 1.0, "rewards/chosen": -4.07321834564209, "rewards/grad_term": 0.004567963071167469, "rewards/margins": 6.260017395019531, "rewards/rejected": -10.333235740661621, "step": 297 }, { "epoch": 0.6384574183181575, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 18.711798908689843, "learning_rate": 5.304295942720763e-07, "logits/chosen": 0.870806872844696, "logits/rejected": 0.6316779851913452, "logps/accuracies": 0.75, "logps/chosen": -419.4470520019531, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -333.89263916015625, "logps/ref_rejected": -268.04510498046875, "logps/rejected": -475.2600402832031, "loss": 0.1525, "rewards/accuracies": 1.0, "rewards/chosen": -4.277721405029297, "rewards/grad_term": 0.0027266484685242176, "rewards/margins": 6.083026885986328, "rewards/rejected": -10.360748291015625, "step": 298 }, { "epoch": 0.6405998928762721, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.25, "grad_norm": 13.094138352584997, "learning_rate": 5.295942720763724e-07, "logits/chosen": 0.4675275683403015, "logits/rejected": 0.606252133846283, "logps/accuracies": 0.75, "logps/chosen": -437.78228759765625, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -361.85552978515625, "logps/ref_rejected": -334.0460510253906, "logps/rejected": -510.67462158203125, "loss": 0.162, "rewards/accuracies": 1.0, "rewards/chosen": -3.7963359355926514, "rewards/grad_term": 0.00101923244073987, "rewards/margins": 5.035091876983643, "rewards/rejected": -8.831427574157715, "step": 299 }, { "epoch": 0.6427423674343867, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 1.0, "grad_norm": 10.604360471548478, "learning_rate": 5.287589498806682e-07, "logits/chosen": 0.7601810097694397, "logits/rejected": 0.5254924297332764, "logps/accuracies": 0.0, "logps/chosen": -309.6020202636719, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -279.2956848144531, "logps/ref_rejected": -154.2837371826172, "logps/rejected": -275.95166015625, "loss": 0.1947, "rewards/accuracies": 1.0, "rewards/chosen": -1.5153169631958008, "rewards/grad_term": 0.00842749048024416, "rewards/margins": 4.56807804107666, "rewards/rejected": -6.083395004272461, "step": 300 }, { "epoch": 0.6448848419925013, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 16.672250976383555, "learning_rate": 5.279236276849642e-07, "logits/chosen": 1.0493147373199463, "logits/rejected": 0.5671635270118713, "logps/accuracies": 0.75, "logps/chosen": -357.0266418457031, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -274.6451721191406, "logps/ref_rejected": -204.9969482421875, "logps/rejected": -309.4927978515625, "loss": 0.2159, "rewards/accuracies": 0.75, "rewards/chosen": -4.119072914123535, "rewards/grad_term": 0.014873827807605267, "rewards/margins": 1.1057183742523193, "rewards/rejected": -5.224791049957275, "step": 301 }, { "epoch": 0.647027316550616, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.772181415056538, "learning_rate": 5.270883054892601e-07, "logits/chosen": 0.6905862092971802, "logits/rejected": 0.6120530366897583, "logps/accuracies": 1.0, "logps/chosen": -301.63970947265625, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -240.38360595703125, "logps/ref_rejected": -259.4972839355469, "logps/rejected": -422.0617370605469, "loss": 0.1314, "rewards/accuracies": 1.0, "rewards/chosen": -3.0628066062927246, "rewards/grad_term": 0.002045161323621869, "rewards/margins": 5.065417289733887, "rewards/rejected": -8.128223419189453, "step": 302 }, { "epoch": 0.6491697911087306, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.52033177772429, "learning_rate": 5.262529832935561e-07, "logits/chosen": 0.5544182658195496, "logits/rejected": 0.45686784386634827, "logps/accuracies": 1.0, "logps/chosen": -278.7823181152344, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -196.63429260253906, "logps/ref_rejected": -185.2466583251953, "logps/rejected": -335.3304443359375, "loss": 0.1615, "rewards/accuracies": 1.0, "rewards/chosen": -4.1074018478393555, "rewards/grad_term": 0.003845647443085909, "rewards/margins": 3.3967883586883545, "rewards/rejected": -7.504190444946289, "step": 303 }, { "epoch": 0.6513122656668452, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.017492432602673, "learning_rate": 5.25417661097852e-07, "logits/chosen": 0.8323764204978943, "logits/rejected": 0.7166695594787598, "logps/accuracies": 1.0, "logps/chosen": -524.2388916015625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -408.53387451171875, "logps/ref_rejected": -332.67181396484375, "logps/rejected": -610.9110717773438, "loss": 0.1781, "rewards/accuracies": 1.0, "rewards/chosen": -5.785252571105957, "rewards/grad_term": 8.712082490092143e-05, "rewards/margins": 8.12671184539795, "rewards/rejected": -13.911964416503906, "step": 304 }, { "epoch": 0.6534547402249599, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.355578424387678, "learning_rate": 5.245823389021479e-07, "logits/chosen": 0.6990536451339722, "logits/rejected": 0.6334518790245056, "logps/accuracies": 0.75, "logps/chosen": -439.53704833984375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -357.226318359375, "logps/ref_rejected": -357.77783203125, "logps/rejected": -587.9480590820312, "loss": 0.1535, "rewards/accuracies": 1.0, "rewards/chosen": -4.115536689758301, "rewards/grad_term": 7.772783283144236e-05, "rewards/margins": 7.392976760864258, "rewards/rejected": -11.508513450622559, "step": 305 }, { "epoch": 0.6555972147830744, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 5.205005797383153, "learning_rate": 5.237470167064439e-07, "logits/chosen": 0.8662493228912354, "logits/rejected": 0.800337553024292, "logps/accuracies": 0.75, "logps/chosen": -347.75860595703125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -291.16888427734375, "logps/ref_rejected": -224.49195861816406, "logps/rejected": -400.5769958496094, "loss": 0.1112, "rewards/accuracies": 1.0, "rewards/chosen": -2.829484701156616, "rewards/grad_term": 0.004376052878797054, "rewards/margins": 5.974765777587891, "rewards/rejected": -8.804250717163086, "step": 306 }, { "epoch": 0.6577396893411891, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 13.319843033839788, "learning_rate": 5.229116945107398e-07, "logits/chosen": 0.6567318439483643, "logits/rejected": 0.7602465748786926, "logps/accuracies": 1.0, "logps/chosen": -388.985107421875, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -338.995361328125, "logps/ref_rejected": -395.92230224609375, "logps/rejected": -499.59552001953125, "loss": 0.2511, "rewards/accuracies": 1.0, "rewards/chosen": -2.4994890689849854, "rewards/grad_term": 0.0039435261860489845, "rewards/margins": 2.684171438217163, "rewards/rejected": -5.183660984039307, "step": 307 }, { "epoch": 0.6598821638993037, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.066998414831264, "learning_rate": 5.220763723150358e-07, "logits/chosen": 0.8940625786781311, "logits/rejected": 0.903926432132721, "logps/accuracies": 1.0, "logps/chosen": -430.4771423339844, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -328.47900390625, "logps/ref_rejected": -382.0928955078125, "logps/rejected": -580.9340209960938, "loss": 0.1645, "rewards/accuracies": 1.0, "rewards/chosen": -5.099907875061035, "rewards/grad_term": 0.0023625774774700403, "rewards/margins": 4.842148780822754, "rewards/rejected": -9.942056655883789, "step": 308 }, { "epoch": 0.6620246384574183, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 5.765055407846113, "learning_rate": 5.212410501193317e-07, "logits/chosen": 0.7993382215499878, "logits/rejected": 0.6162198185920715, "logps/accuracies": 0.75, "logps/chosen": -455.43585205078125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -387.1122741699219, "logps/ref_rejected": -310.28033447265625, "logps/rejected": -487.04913330078125, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": -3.4161789417266846, "rewards/grad_term": 0.0006664180546067655, "rewards/margins": 5.422262668609619, "rewards/rejected": -8.838441848754883, "step": 309 }, { "epoch": 0.664167113015533, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 10.526420429572342, "learning_rate": 5.204057279236276e-07, "logits/chosen": 0.6536291241645813, "logits/rejected": 0.7170487642288208, "logps/accuracies": 1.0, "logps/chosen": -343.4113464355469, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -269.9009704589844, "logps/ref_rejected": -301.39697265625, "logps/rejected": -437.5945129394531, "loss": 0.1736, "rewards/accuracies": 1.0, "rewards/chosen": -3.6755175590515137, "rewards/grad_term": 0.004929536487907171, "rewards/margins": 3.1343586444854736, "rewards/rejected": -6.809875965118408, "step": 310 }, { "epoch": 0.6663095875736476, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.317599300319795, "learning_rate": 5.195704057279236e-07, "logits/chosen": 0.7689659595489502, "logits/rejected": 0.5763236284255981, "logps/accuracies": 0.75, "logps/chosen": -359.8375244140625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -294.6138916015625, "logps/ref_rejected": -236.17083740234375, "logps/rejected": -379.31927490234375, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": -3.2611827850341797, "rewards/grad_term": 0.003922306001186371, "rewards/margins": 3.896237850189209, "rewards/rejected": -7.157420635223389, "step": 311 }, { "epoch": 0.6684520621317622, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.491312588715147, "learning_rate": 5.187350835322196e-07, "logits/chosen": 0.6590454578399658, "logits/rejected": 0.3621766269207001, "logps/accuracies": 0.75, "logps/chosen": -279.87823486328125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -219.8527374267578, "logps/ref_rejected": -189.44700622558594, "logps/rejected": -309.5084533691406, "loss": 0.2011, "rewards/accuracies": 0.75, "rewards/chosen": -3.001275062561035, "rewards/grad_term": 0.0161147378385067, "rewards/margins": 3.0017971992492676, "rewards/rejected": -6.003072261810303, "step": 312 }, { "epoch": 0.6705945366898768, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.580563906265365, "learning_rate": 5.178997613365155e-07, "logits/chosen": 0.8489370942115784, "logits/rejected": 0.7335869073867798, "logps/accuracies": 0.75, "logps/chosen": -423.52874755859375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -348.4372863769531, "logps/ref_rejected": -324.8551025390625, "logps/rejected": -563.6224975585938, "loss": 0.131, "rewards/accuracies": 1.0, "rewards/chosen": -3.754572629928589, "rewards/grad_term": 0.00203010905534029, "rewards/margins": 8.183797836303711, "rewards/rejected": -11.938370704650879, "step": 313 }, { "epoch": 0.6727370112479915, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 8.743399053571068, "learning_rate": 5.170644391408115e-07, "logits/chosen": 0.940852165222168, "logits/rejected": 0.7845810651779175, "logps/accuracies": 0.25, "logps/chosen": -512.4435424804688, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -435.47698974609375, "logps/ref_rejected": -288.2704162597656, "logps/rejected": -490.13836669921875, "loss": 0.1653, "rewards/accuracies": 1.0, "rewards/chosen": -3.84832763671875, "rewards/grad_term": 0.00016607397992629558, "rewards/margins": 6.24506950378418, "rewards/rejected": -10.09339714050293, "step": 314 }, { "epoch": 0.674879485806106, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.878420297933516, "learning_rate": 5.162291169451074e-07, "logits/chosen": 0.7630524039268494, "logits/rejected": 0.6900883913040161, "logps/accuracies": 0.75, "logps/chosen": -469.132080078125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -356.17041015625, "logps/ref_rejected": -346.5165100097656, "logps/rejected": -575.3904418945312, "loss": 0.1708, "rewards/accuracies": 1.0, "rewards/chosen": -5.648085594177246, "rewards/grad_term": 0.0032002755906432867, "rewards/margins": 5.795612335205078, "rewards/rejected": -11.443696975708008, "step": 315 }, { "epoch": 0.6770219603642207, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.731907474004624, "learning_rate": 5.153937947494033e-07, "logits/chosen": 0.5569190979003906, "logits/rejected": 0.6198952794075012, "logps/accuracies": 1.0, "logps/chosen": -371.6037292480469, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -293.22381591796875, "logps/ref_rejected": -300.0587463378906, "logps/rejected": -534.5460205078125, "loss": 0.1858, "rewards/accuracies": 1.0, "rewards/chosen": -3.9189953804016113, "rewards/grad_term": 0.00014798434858676046, "rewards/margins": 7.8053669929504395, "rewards/rejected": -11.72436237335205, "step": 316 }, { "epoch": 0.6791644349223352, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.47786260702623, "learning_rate": 5.145584725536993e-07, "logits/chosen": 0.8376766443252563, "logits/rejected": 0.7569788098335266, "logps/accuracies": 0.75, "logps/chosen": -496.47161865234375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -384.5050048828125, "logps/ref_rejected": -329.0260925292969, "logps/rejected": -620.14892578125, "loss": 0.1397, "rewards/accuracies": 1.0, "rewards/chosen": -5.598330497741699, "rewards/grad_term": 0.0014825062826275826, "rewards/margins": 8.95781135559082, "rewards/rejected": -14.556142807006836, "step": 317 }, { "epoch": 0.6813069094804499, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.306648680307601, "learning_rate": 5.137231503579952e-07, "logits/chosen": 0.6440654993057251, "logits/rejected": 0.5563719868659973, "logps/accuracies": 1.0, "logps/chosen": -464.43914794921875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -345.64239501953125, "logps/ref_rejected": -302.1645202636719, "logps/rejected": -522.718505859375, "loss": 0.1143, "rewards/accuracies": 1.0, "rewards/chosen": -5.939836502075195, "rewards/grad_term": 0.003235024632886052, "rewards/margins": 5.087862968444824, "rewards/rejected": -11.02769947052002, "step": 318 }, { "epoch": 0.6834493840385646, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.209754726841052, "learning_rate": 5.128878281622912e-07, "logits/chosen": 0.7397335171699524, "logits/rejected": 0.6839704513549805, "logps/accuracies": 0.75, "logps/chosen": -304.3392639160156, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -254.27145385742188, "logps/ref_rejected": -232.0565948486328, "logps/rejected": -400.621337890625, "loss": 0.179, "rewards/accuracies": 1.0, "rewards/chosen": -2.503389835357666, "rewards/grad_term": 0.005540979094803333, "rewards/margins": 5.924847602844238, "rewards/rejected": -8.428237915039062, "step": 319 }, { "epoch": 0.6855918585966791, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.033781332110546, "learning_rate": 5.120525059665871e-07, "logits/chosen": 0.7168871164321899, "logits/rejected": 0.7381715774536133, "logps/accuracies": 1.0, "logps/chosen": -508.3505859375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -435.618896484375, "logps/ref_rejected": -405.5649719238281, "logps/rejected": -566.0382080078125, "loss": 0.1406, "rewards/accuracies": 1.0, "rewards/chosen": -3.636585235595703, "rewards/grad_term": 0.0022972766309976578, "rewards/margins": 4.387078762054443, "rewards/rejected": -8.023663520812988, "step": 320 }, { "epoch": 0.6855918585966791, "eval_flips/correct->correct": 0.14000000059604645, "eval_flips/correct->incorrect": 0.019999999552965164, "eval_flips/incorrect->correct": 0.5400000214576721, "eval_flips/incorrect->incorrect": 0.30000001192092896, "eval_logits/chosen": 0.7168383002281189, "eval_logits/rejected": 0.599204957485199, "eval_logps/accuracies": 0.6800000071525574, "eval_logps/chosen": -391.7984619140625, "eval_logps/ref_accuracies": 0.1599999964237213, "eval_logps/ref_chosen": -323.51568603515625, "eval_logps/ref_rejected": -258.70098876953125, "eval_logps/rejected": -410.0682678222656, "eval_loss": 0.17036853730678558, "eval_rewards/accuracies": 0.8999999761581421, "eval_rewards/chosen": -3.414141893386841, "eval_rewards/grad_term": 0.007643704302608967, "eval_rewards/margins": 4.154223442077637, "eval_rewards/rejected": -7.568365097045898, "eval_runtime": 374.585, "eval_samples_per_second": 4.218, "eval_steps_per_second": 0.133, "step": 320 }, { "epoch": 0.6877343331547938, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.190622967503248, "learning_rate": 5.11217183770883e-07, "logits/chosen": 0.8069337606430054, "logits/rejected": 0.7580575942993164, "logps/accuracies": 1.0, "logps/chosen": -503.8867492675781, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -380.65765380859375, "logps/ref_rejected": -327.27337646484375, "logps/rejected": -596.4686279296875, "loss": 0.1443, "rewards/accuracies": 1.0, "rewards/chosen": -6.1614532470703125, "rewards/grad_term": 0.00022571110457647592, "rewards/margins": 7.298309326171875, "rewards/rejected": -13.459762573242188, "step": 321 }, { "epoch": 0.6898768077129084, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 13.881500506001192, "learning_rate": 5.10381861575179e-07, "logits/chosen": 0.9965633749961853, "logits/rejected": 0.7457981109619141, "logps/accuracies": 0.75, "logps/chosen": -474.2216491699219, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -421.9150390625, "logps/ref_rejected": -353.48065185546875, "logps/rejected": -548.97265625, "loss": 0.1638, "rewards/accuracies": 1.0, "rewards/chosen": -2.6153299808502197, "rewards/grad_term": 0.0007248412584885955, "rewards/margins": 7.159272193908691, "rewards/rejected": -9.774601936340332, "step": 322 }, { "epoch": 0.692019282271023, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.643673103869992, "learning_rate": 5.095465393794749e-07, "logits/chosen": 0.49757474660873413, "logits/rejected": 0.41455477476119995, "logps/accuracies": 0.5, "logps/chosen": -357.2688293457031, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -293.4722900390625, "logps/ref_rejected": -242.71273803710938, "logps/rejected": -381.94488525390625, "loss": 0.1431, "rewards/accuracies": 1.0, "rewards/chosen": -3.1898274421691895, "rewards/grad_term": 0.0033772799652069807, "rewards/margins": 3.771780490875244, "rewards/rejected": -6.961607933044434, "step": 323 }, { "epoch": 0.6941617568291376, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.731140676983227, "learning_rate": 5.087112171837709e-07, "logits/chosen": 0.8970646858215332, "logits/rejected": 0.7831761240959167, "logps/accuracies": 1.0, "logps/chosen": -555.628173828125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -494.30938720703125, "logps/ref_rejected": -413.06451416015625, "logps/rejected": -587.2745971679688, "loss": 0.1652, "rewards/accuracies": 1.0, "rewards/chosen": -3.065938711166382, "rewards/grad_term": 0.002578580752015114, "rewards/margins": 5.644565582275391, "rewards/rejected": -8.710504531860352, "step": 324 }, { "epoch": 0.6963042313872523, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.5010699802203655, "learning_rate": 5.078758949880667e-07, "logits/chosen": 0.8070268630981445, "logits/rejected": 0.6490368247032166, "logps/accuracies": 1.0, "logps/chosen": -577.15478515625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -460.09661865234375, "logps/ref_rejected": -362.884033203125, "logps/rejected": -633.048095703125, "loss": 0.1418, "rewards/accuracies": 1.0, "rewards/chosen": -5.852906227111816, "rewards/grad_term": 0.00013052637223154306, "rewards/margins": 7.655299186706543, "rewards/rejected": -13.50820541381836, "step": 325 }, { "epoch": 0.698446705945367, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.546015605698586, "learning_rate": 5.070405727923628e-07, "logits/chosen": 0.5231800079345703, "logits/rejected": 0.4889012575149536, "logps/accuracies": 0.75, "logps/chosen": -264.241455078125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -208.79318237304688, "logps/ref_rejected": -202.1544189453125, "logps/rejected": -345.9344787597656, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": -2.7724146842956543, "rewards/grad_term": 0.007728134281933308, "rewards/margins": 4.41658878326416, "rewards/rejected": -7.189002990722656, "step": 326 }, { "epoch": 0.7005891805034815, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 12.648714009679955, "learning_rate": 5.062052505966587e-07, "logits/chosen": 0.7718223333358765, "logits/rejected": 0.7055037021636963, "logps/accuracies": 0.75, "logps/chosen": -459.9751892089844, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -358.0395812988281, "logps/ref_rejected": -338.55364990234375, "logps/rejected": -556.436279296875, "loss": 0.1803, "rewards/accuracies": 1.0, "rewards/chosen": -5.096780776977539, "rewards/grad_term": 0.0066120820119977, "rewards/margins": 5.797348976135254, "rewards/rejected": -10.89413070678711, "step": 327 }, { "epoch": 0.7027316550615962, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.30205872449101, "learning_rate": 5.053699284009546e-07, "logits/chosen": 0.6625626683235168, "logits/rejected": 0.6732759475708008, "logps/accuracies": 0.75, "logps/chosen": -315.85107421875, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -257.46136474609375, "logps/ref_rejected": -273.6905212402344, "logps/rejected": -384.72711181640625, "loss": 0.1767, "rewards/accuracies": 1.0, "rewards/chosen": -2.919485092163086, "rewards/grad_term": 0.007900861091911793, "rewards/margins": 2.6323447227478027, "rewards/rejected": -5.551829814910889, "step": 328 }, { "epoch": 0.7048741296197107, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 10.156605830327571, "learning_rate": 5.045346062052505e-07, "logits/chosen": 0.7717033624649048, "logits/rejected": 0.6601508855819702, "logps/accuracies": 1.0, "logps/chosen": -463.640380859375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -382.90325927734375, "logps/ref_rejected": -358.9565734863281, "logps/rejected": -566.164306640625, "loss": 0.1797, "rewards/accuracies": 1.0, "rewards/chosen": -4.036858558654785, "rewards/grad_term": 0.0012353091733530164, "rewards/margins": 6.323529243469238, "rewards/rejected": -10.360387802124023, "step": 329 }, { "epoch": 0.7070166041778254, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.641900480934386, "learning_rate": 5.036992840095465e-07, "logits/chosen": 0.6714078187942505, "logits/rejected": 0.5740436315536499, "logps/accuracies": 1.0, "logps/chosen": -453.4899597167969, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -362.24810791015625, "logps/ref_rejected": -388.57666015625, "logps/rejected": -607.985595703125, "loss": 0.1501, "rewards/accuracies": 0.75, "rewards/chosen": -4.562093257904053, "rewards/grad_term": 0.011679948307573795, "rewards/margins": 6.408352851867676, "rewards/rejected": -10.97044563293457, "step": 330 }, { "epoch": 0.70915907873594, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.768091277395346, "learning_rate": 5.028639618138425e-07, "logits/chosen": 0.4849713146686554, "logits/rejected": 0.3761657476425171, "logps/accuracies": 0.75, "logps/chosen": -472.632080078125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -394.3712158203125, "logps/ref_rejected": -320.7559814453125, "logps/rejected": -576.399658203125, "loss": 0.1809, "rewards/accuracies": 1.0, "rewards/chosen": -3.9130444526672363, "rewards/grad_term": 0.0002858239458873868, "rewards/margins": 8.869141578674316, "rewards/rejected": -12.782186508178711, "step": 331 }, { "epoch": 0.7113015532940546, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.361339079624875, "learning_rate": 5.020286396181383e-07, "logits/chosen": 0.7538321614265442, "logits/rejected": 0.619208574295044, "logps/accuracies": 0.5, "logps/chosen": -538.201416015625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -406.4328308105469, "logps/ref_rejected": -341.9276123046875, "logps/rejected": -551.7299194335938, "loss": 0.1106, "rewards/accuracies": 0.75, "rewards/chosen": -6.588429927825928, "rewards/grad_term": 0.009247011505067348, "rewards/margins": 3.9016823768615723, "rewards/rejected": -10.4901123046875, "step": 332 }, { "epoch": 0.7134440278521692, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.211461201062549, "learning_rate": 5.011933174224344e-07, "logits/chosen": 0.9981447458267212, "logits/rejected": 0.6901638507843018, "logps/accuracies": 0.75, "logps/chosen": -473.5837707519531, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -413.8009948730469, "logps/ref_rejected": -332.0254211425781, "logps/rejected": -507.0432434082031, "loss": 0.1231, "rewards/accuracies": 1.0, "rewards/chosen": -2.9891393184661865, "rewards/grad_term": 0.0005725694936700165, "rewards/margins": 5.761752605438232, "rewards/rejected": -8.75089168548584, "step": 333 }, { "epoch": 0.7155865024102839, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.170697657476269, "learning_rate": 5.003579952267303e-07, "logits/chosen": 0.826168954372406, "logits/rejected": 0.6049452424049377, "logps/accuracies": 0.5, "logps/chosen": -268.6787109375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -210.28704833984375, "logps/ref_rejected": -152.79354858398438, "logps/rejected": -252.23291015625, "loss": 0.1311, "rewards/accuracies": 1.0, "rewards/chosen": -2.919583797454834, "rewards/grad_term": 0.009706183336675167, "rewards/margins": 2.0523836612701416, "rewards/rejected": -4.9719672203063965, "step": 334 }, { "epoch": 0.7177289769683985, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.060208207763219, "learning_rate": 4.995226730310263e-07, "logits/chosen": 0.8894015550613403, "logits/rejected": 0.6564974784851074, "logps/accuracies": 1.0, "logps/chosen": -500.45635986328125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -414.86962890625, "logps/ref_rejected": -351.6375732421875, "logps/rejected": -605.1538696289062, "loss": 0.1278, "rewards/accuracies": 1.0, "rewards/chosen": -4.2793378829956055, "rewards/grad_term": 0.0034387409687042236, "rewards/margins": 8.396476745605469, "rewards/rejected": -12.675814628601074, "step": 335 }, { "epoch": 0.7198714515265131, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.25, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.56000560457653, "learning_rate": 4.986873508353221e-07, "logits/chosen": 0.6553113460540771, "logits/rejected": 0.6264476180076599, "logps/accuracies": 0.75, "logps/chosen": -260.0503234863281, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -188.68035888671875, "logps/ref_rejected": -198.7422332763672, "logps/rejected": -327.9527282714844, "loss": 0.157, "rewards/accuracies": 0.75, "rewards/chosen": -3.5684990882873535, "rewards/grad_term": 0.014257272705435753, "rewards/margins": 2.8920247554779053, "rewards/rejected": -6.46052360534668, "step": 336 }, { "epoch": 0.7220139260846278, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 5.247249685460771, "learning_rate": 4.978520286396182e-07, "logits/chosen": 0.8244426250457764, "logits/rejected": 0.6587880849838257, "logps/accuracies": 0.75, "logps/chosen": -550.476318359375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -413.40692138671875, "logps/ref_rejected": -312.9790954589844, "logps/rejected": -601.8764038085938, "loss": 0.1166, "rewards/accuracies": 1.0, "rewards/chosen": -6.853466987609863, "rewards/grad_term": 0.0013105407124385238, "rewards/margins": 7.591399192810059, "rewards/rejected": -14.444866180419922, "step": 337 }, { "epoch": 0.7241564006427423, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.3962044900308355, "learning_rate": 4.970167064439141e-07, "logits/chosen": 0.36316683888435364, "logits/rejected": 0.26318785548210144, "logps/accuracies": 0.5, "logps/chosen": -459.98480224609375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -393.48419189453125, "logps/ref_rejected": -242.56094360351562, "logps/rejected": -432.6356506347656, "loss": 0.1479, "rewards/accuracies": 1.0, "rewards/chosen": -3.3250298500061035, "rewards/grad_term": 0.0004948938149027526, "rewards/margins": 6.178703784942627, "rewards/rejected": -9.50373363494873, "step": 338 }, { "epoch": 0.726298875200857, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.583520613174011, "learning_rate": 4.9618138424821e-07, "logits/chosen": 0.6698114275932312, "logits/rejected": 0.6079827547073364, "logps/accuracies": 1.0, "logps/chosen": -316.6778869628906, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -285.0715637207031, "logps/ref_rejected": -286.4187316894531, "logps/rejected": -387.62554931640625, "loss": 0.1442, "rewards/accuracies": 1.0, "rewards/chosen": -1.5803159475326538, "rewards/grad_term": 0.007696358487010002, "rewards/margins": 3.480024814605713, "rewards/rejected": -5.060340881347656, "step": 339 }, { "epoch": 0.7284413497589716, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.838033966465567, "learning_rate": 4.953460620525059e-07, "logits/chosen": 0.7049505710601807, "logits/rejected": 0.6646623015403748, "logps/accuracies": 1.0, "logps/chosen": -391.0535888671875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -312.54132080078125, "logps/ref_rejected": -331.8018798828125, "logps/rejected": -517.3717041015625, "loss": 0.146, "rewards/accuracies": 0.75, "rewards/chosen": -3.925614833831787, "rewards/grad_term": 0.007477066479623318, "rewards/margins": 5.35287618637085, "rewards/rejected": -9.278491020202637, "step": 340 }, { "epoch": 0.7305838243170862, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.719543349851092, "learning_rate": 4.945107398568019e-07, "logits/chosen": 0.6794430017471313, "logits/rejected": 0.5714715719223022, "logps/accuracies": 1.0, "logps/chosen": -349.4491271972656, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -272.2052001953125, "logps/ref_rejected": -253.03768920898438, "logps/rejected": -437.6853942871094, "loss": 0.1376, "rewards/accuracies": 1.0, "rewards/chosen": -3.862197160720825, "rewards/grad_term": 0.004704746417701244, "rewards/margins": 5.3701887130737305, "rewards/rejected": -9.232385635375977, "step": 341 }, { "epoch": 0.7327262988752009, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.451256306905016, "learning_rate": 4.936754176610979e-07, "logits/chosen": 0.6575830578804016, "logits/rejected": 0.6638262271881104, "logps/accuracies": 0.75, "logps/chosen": -314.11114501953125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -257.2018737792969, "logps/ref_rejected": -241.7303009033203, "logps/rejected": -397.7080078125, "loss": 0.1602, "rewards/accuracies": 1.0, "rewards/chosen": -2.845463275909424, "rewards/grad_term": 0.009044932201504707, "rewards/margins": 4.953423023223877, "rewards/rejected": -7.798886299133301, "step": 342 }, { "epoch": 0.7348687734333155, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.459693770757402, "learning_rate": 4.928400954653937e-07, "logits/chosen": 0.529110312461853, "logits/rejected": 0.46302855014801025, "logps/accuracies": 0.75, "logps/chosen": -384.07061767578125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -316.386962890625, "logps/ref_rejected": -298.5597839355469, "logps/rejected": -452.8936767578125, "loss": 0.1482, "rewards/accuracies": 1.0, "rewards/chosen": -3.3841824531555176, "rewards/grad_term": 0.0015116020804271102, "rewards/margins": 4.332512378692627, "rewards/rejected": -7.7166948318481445, "step": 343 }, { "epoch": 0.7370112479914301, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.920848833475498, "learning_rate": 4.920047732696897e-07, "logits/chosen": 0.7953596711158752, "logits/rejected": 0.7348934412002563, "logps/accuracies": 0.75, "logps/chosen": -594.716552734375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -465.59210205078125, "logps/ref_rejected": -436.11572265625, "logps/rejected": -708.698974609375, "loss": 0.1812, "rewards/accuracies": 1.0, "rewards/chosen": -6.456222057342529, "rewards/grad_term": 0.0025254576466977596, "rewards/margins": 7.172940731048584, "rewards/rejected": -13.629162788391113, "step": 344 }, { "epoch": 0.7391537225495447, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.924633562541802, "learning_rate": 4.911694510739857e-07, "logits/chosen": 0.5842578411102295, "logits/rejected": 0.6216070652008057, "logps/accuracies": 1.0, "logps/chosen": -463.14154052734375, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -373.7330322265625, "logps/ref_rejected": -408.91278076171875, "logps/rejected": -685.0960693359375, "loss": 0.1078, "rewards/accuracies": 1.0, "rewards/chosen": -4.470427513122559, "rewards/grad_term": 0.0002844279515556991, "rewards/margins": 9.338738441467285, "rewards/rejected": -13.809165000915527, "step": 345 }, { "epoch": 0.7412961971076594, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 12.414755725942422, "learning_rate": 4.903341288782816e-07, "logits/chosen": 0.8328725099563599, "logits/rejected": 0.7645402550697327, "logps/accuracies": 0.75, "logps/chosen": -414.1654968261719, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -368.20654296875, "logps/ref_rejected": -308.8746337890625, "logps/rejected": -470.432373046875, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": -2.297947883605957, "rewards/grad_term": 0.00027060159482061863, "rewards/margins": 5.779940605163574, "rewards/rejected": -8.077888488769531, "step": 346 }, { "epoch": 0.7434386716657739, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.169966513004652, "learning_rate": 4.894988066825775e-07, "logits/chosen": 0.554535984992981, "logits/rejected": 0.265200138092041, "logps/accuracies": 0.75, "logps/chosen": -356.061767578125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -315.3848876953125, "logps/ref_rejected": -268.03143310546875, "logps/rejected": -405.561767578125, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": -2.033844470977783, "rewards/grad_term": 0.005571221467107534, "rewards/margins": 4.842672348022461, "rewards/rejected": -6.876516342163086, "step": 347 }, { "epoch": 0.7455811462238886, "flips/correct->correct": 1.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.180681687024289, "learning_rate": 4.886634844868734e-07, "logits/chosen": 0.6118199229240417, "logits/rejected": 0.6814651489257812, "logps/accuracies": 1.0, "logps/chosen": -246.84640502929688, "logps/ref_accuracies": 1.0, "logps/ref_chosen": -195.61514282226562, "logps/ref_rejected": -214.57460021972656, "logps/rejected": -361.24395751953125, "loss": 0.1331, "rewards/accuracies": 0.75, "rewards/chosen": -2.5615625381469727, "rewards/grad_term": 0.007279230747371912, "rewards/margins": 4.77190637588501, "rewards/rejected": -7.333469390869141, "step": 348 }, { "epoch": 0.7477236207820032, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.210731357114345, "learning_rate": 4.878281622911695e-07, "logits/chosen": 1.0106936693191528, "logits/rejected": 0.45131033658981323, "logps/accuracies": 0.5, "logps/chosen": -454.847412109375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -324.7191162109375, "logps/ref_rejected": -223.54873657226562, "logps/rejected": -440.8221740722656, "loss": 0.1168, "rewards/accuracies": 1.0, "rewards/chosen": -6.506414890289307, "rewards/grad_term": 0.009222909808158875, "rewards/margins": 4.357255935668945, "rewards/rejected": -10.86367130279541, "step": 349 }, { "epoch": 0.7498660953401178, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.362818321342788, "learning_rate": 4.869928400954653e-07, "logits/chosen": 0.586777925491333, "logits/rejected": 0.5780112147331238, "logps/accuracies": 1.0, "logps/chosen": -262.08038330078125, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -232.11459350585938, "logps/ref_rejected": -231.54193115234375, "logps/rejected": -376.2002868652344, "loss": 0.1459, "rewards/accuracies": 1.0, "rewards/chosen": -1.4982905387878418, "rewards/grad_term": 0.006848993711173534, "rewards/margins": 5.734626770019531, "rewards/rejected": -7.232917785644531, "step": 350 }, { "epoch": 0.7520085698982325, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.297264703165888, "learning_rate": 4.861575178997613e-07, "logits/chosen": 0.5312216877937317, "logits/rejected": 0.5838853716850281, "logps/accuracies": 1.0, "logps/chosen": -411.0349426269531, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -309.9207763671875, "logps/ref_rejected": -289.38275146484375, "logps/rejected": -511.20758056640625, "loss": 0.1103, "rewards/accuracies": 1.0, "rewards/chosen": -5.055708885192871, "rewards/grad_term": 0.0036189735401421785, "rewards/margins": 6.035533428192139, "rewards/rejected": -11.091242790222168, "step": 351 }, { "epoch": 0.7541510444563471, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.720091390841414, "learning_rate": 4.853221957040573e-07, "logits/chosen": 0.8272877931594849, "logits/rejected": 0.6150888204574585, "logps/accuracies": 0.75, "logps/chosen": -382.7689514160156, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -322.699951171875, "logps/ref_rejected": -250.23941040039062, "logps/rejected": -417.6976623535156, "loss": 0.1311, "rewards/accuracies": 1.0, "rewards/chosen": -3.0034492015838623, "rewards/grad_term": 0.0010191942565143108, "rewards/margins": 5.369463920593262, "rewards/rejected": -8.372913360595703, "step": 352 }, { "epoch": 0.7562935190144617, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.819345327687853, "learning_rate": 4.844868735083532e-07, "logits/chosen": 0.8716313242912292, "logits/rejected": 0.7711046934127808, "logps/accuracies": 0.75, "logps/chosen": -352.4783020019531, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -285.69561767578125, "logps/ref_rejected": -241.1641845703125, "logps/rejected": -428.366943359375, "loss": 0.131, "rewards/accuracies": 1.0, "rewards/chosen": -3.3391335010528564, "rewards/grad_term": 0.006919885985553265, "rewards/margins": 6.021005153656006, "rewards/rejected": -9.360138893127441, "step": 353 }, { "epoch": 0.7584359935725763, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.544013532966439, "learning_rate": 4.836515513126491e-07, "logits/chosen": 0.8605263233184814, "logits/rejected": 0.7548648715019226, "logps/accuracies": 0.75, "logps/chosen": -513.380126953125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -414.0445861816406, "logps/ref_rejected": -390.1202392578125, "logps/rejected": -633.4596557617188, "loss": 0.1838, "rewards/accuracies": 1.0, "rewards/chosen": -4.966775417327881, "rewards/grad_term": 0.005854703951627016, "rewards/margins": 7.2001953125, "rewards/rejected": -12.166970252990723, "step": 354 }, { "epoch": 0.760578468130691, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.840668647752036, "learning_rate": 4.82816229116945e-07, "logits/chosen": 0.49805212020874023, "logits/rejected": 0.5557568669319153, "logps/accuracies": 0.75, "logps/chosen": -327.806884765625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -277.4596252441406, "logps/ref_rejected": -261.7403869628906, "logps/rejected": -415.97454833984375, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": -2.5173633098602295, "rewards/grad_term": 0.0034056631848216057, "rewards/margins": 5.194344997406006, "rewards/rejected": -7.711708068847656, "step": 355 }, { "epoch": 0.7627209426888055, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.886486362606452, "learning_rate": 4.819809069212411e-07, "logits/chosen": 0.523609459400177, "logits/rejected": 0.3943213224411011, "logps/accuracies": 1.0, "logps/chosen": -449.4044494628906, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -386.558349609375, "logps/ref_rejected": -362.3239440917969, "logps/rejected": -540.4671020507812, "loss": 0.1275, "rewards/accuracies": 1.0, "rewards/chosen": -3.1423020362854004, "rewards/grad_term": 0.000673395290505141, "rewards/margins": 5.76485538482666, "rewards/rejected": -8.907157897949219, "step": 356 }, { "epoch": 0.7648634172469202, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.772834608130855, "learning_rate": 4.811455847255369e-07, "logits/chosen": 0.7413110733032227, "logits/rejected": 0.6600308418273926, "logps/accuracies": 0.75, "logps/chosen": -372.83050537109375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -308.9735107421875, "logps/ref_rejected": -233.7576904296875, "logps/rejected": -412.01788330078125, "loss": 0.107, "rewards/accuracies": 1.0, "rewards/chosen": -3.192850351333618, "rewards/grad_term": 0.0023573609068989754, "rewards/margins": 5.720157623291016, "rewards/rejected": -8.913007736206055, "step": 357 }, { "epoch": 0.7670058918050349, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.586255491609359, "learning_rate": 4.803102625298329e-07, "logits/chosen": 0.7194008827209473, "logits/rejected": 0.529259443283081, "logps/accuracies": 0.25, "logps/chosen": -470.25994873046875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -412.6962890625, "logps/ref_rejected": -344.50018310546875, "logps/rejected": -454.2938537597656, "loss": 0.1335, "rewards/accuracies": 1.0, "rewards/chosen": -2.8781819343566895, "rewards/grad_term": 0.006395381409674883, "rewards/margins": 2.6115007400512695, "rewards/rejected": -5.489683151245117, "step": 358 }, { "epoch": 0.7691483663631494, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.427948430933896, "learning_rate": 4.794749403341288e-07, "logits/chosen": 0.5997850298881531, "logits/rejected": 0.39114004373550415, "logps/accuracies": 0.5, "logps/chosen": -450.5962219238281, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -360.58770751953125, "logps/ref_rejected": -266.60211181640625, "logps/rejected": -491.2527770996094, "loss": 0.1088, "rewards/accuracies": 1.0, "rewards/chosen": -4.500424861907959, "rewards/grad_term": 0.0012759572127833962, "rewards/margins": 6.7321085929870605, "rewards/rejected": -11.232534408569336, "step": 359 }, { "epoch": 0.7712908409212641, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.10147872579214, "learning_rate": 4.786396181384249e-07, "logits/chosen": 0.6526762247085571, "logits/rejected": 0.406318336725235, "logps/accuracies": 0.75, "logps/chosen": -362.89215087890625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -327.1744079589844, "logps/ref_rejected": -287.7540588378906, "logps/rejected": -391.6522216796875, "loss": 0.1822, "rewards/accuracies": 1.0, "rewards/chosen": -1.7858877182006836, "rewards/grad_term": 0.003739753272384405, "rewards/margins": 3.4090189933776855, "rewards/rejected": -5.194906234741211, "step": 360 }, { "epoch": 0.7734333154793787, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.009801801350886, "learning_rate": 4.778042959427207e-07, "logits/chosen": 0.9286985993385315, "logits/rejected": 0.75478196144104, "logps/accuracies": 0.75, "logps/chosen": -341.165771484375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -286.45068359375, "logps/ref_rejected": -226.50819396972656, "logps/rejected": -388.7208251953125, "loss": 0.1502, "rewards/accuracies": 1.0, "rewards/chosen": -2.7357535362243652, "rewards/grad_term": 0.005150636192411184, "rewards/margins": 5.374879360198975, "rewards/rejected": -8.11063289642334, "step": 361 }, { "epoch": 0.7755757900374933, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.079915978073272, "learning_rate": 4.769689737470167e-07, "logits/chosen": 0.15154126286506653, "logits/rejected": 0.23431162536144257, "logps/accuracies": 0.75, "logps/chosen": -141.20352172851562, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -122.65444946289062, "logps/ref_rejected": -115.94908142089844, "logps/rejected": -179.49026489257812, "loss": 0.1532, "rewards/accuracies": 1.0, "rewards/chosen": -0.9274539351463318, "rewards/grad_term": 0.005933484528213739, "rewards/margins": 2.249605417251587, "rewards/rejected": -3.1770591735839844, "step": 362 }, { "epoch": 0.7777182645956079, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.620781698654811, "learning_rate": 4.7613365155131265e-07, "logits/chosen": 0.6806411147117615, "logits/rejected": 0.5877007842063904, "logps/accuracies": 1.0, "logps/chosen": -305.52105712890625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -225.72100830078125, "logps/ref_rejected": -230.88204956054688, "logps/rejected": -416.0865783691406, "loss": 0.1264, "rewards/accuracies": 1.0, "rewards/chosen": -3.9900035858154297, "rewards/grad_term": 0.0007602861733175814, "rewards/margins": 5.270223617553711, "rewards/rejected": -9.26022720336914, "step": 363 }, { "epoch": 0.7798607391537226, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.162163225237173, "learning_rate": 4.752983293556086e-07, "logits/chosen": 0.3107702434062958, "logits/rejected": 0.46145111322402954, "logps/accuracies": 0.75, "logps/chosen": -376.48138427734375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -298.7530517578125, "logps/ref_rejected": -248.01364135742188, "logps/rejected": -483.236572265625, "loss": 0.1518, "rewards/accuracies": 1.0, "rewards/chosen": -3.8864171504974365, "rewards/grad_term": 0.0027246952522546053, "rewards/margins": 7.874729156494141, "rewards/rejected": -11.761146545410156, "step": 364 }, { "epoch": 0.7820032137118371, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.682442207333185, "learning_rate": 4.744630071599045e-07, "logits/chosen": 0.8286511301994324, "logits/rejected": 0.7338289618492126, "logps/accuracies": 0.75, "logps/chosen": -475.2028503417969, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -419.9889221191406, "logps/ref_rejected": -337.7398681640625, "logps/rejected": -524.1643676757812, "loss": 0.1149, "rewards/accuracies": 1.0, "rewards/chosen": -2.7606959342956543, "rewards/grad_term": 0.0009419883135706186, "rewards/margins": 6.5605316162109375, "rewards/rejected": -9.321227073669434, "step": 365 }, { "epoch": 0.7841456882699518, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.969521309776873, "learning_rate": 4.736276849642005e-07, "logits/chosen": 0.4039073884487152, "logits/rejected": 0.4401341676712036, "logps/accuracies": 0.75, "logps/chosen": -332.14044189453125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -283.09283447265625, "logps/ref_rejected": -206.31539916992188, "logps/rejected": -350.80340576171875, "loss": 0.1418, "rewards/accuracies": 1.0, "rewards/chosen": -2.4523794651031494, "rewards/grad_term": 0.0033193090930581093, "rewards/margins": 4.772021293640137, "rewards/rejected": -7.224400997161865, "step": 366 }, { "epoch": 0.7862881628280665, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.54876490811146, "learning_rate": 4.727923627684964e-07, "logits/chosen": 0.3236476182937622, "logits/rejected": 0.3070971667766571, "logps/accuracies": 0.5, "logps/chosen": -317.3150329589844, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -236.6580810546875, "logps/ref_rejected": -158.20616149902344, "logps/rejected": -292.539794921875, "loss": 0.1308, "rewards/accuracies": 0.75, "rewards/chosen": -4.0328474044799805, "rewards/grad_term": 0.016018129885196686, "rewards/margins": 2.683833599090576, "rewards/rejected": -6.716681003570557, "step": 367 }, { "epoch": 0.788430637386181, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.13550622656799, "learning_rate": 4.7195704057279233e-07, "logits/chosen": 0.762974739074707, "logits/rejected": 0.506515622138977, "logps/accuracies": 0.5, "logps/chosen": -476.596923828125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -387.86663818359375, "logps/ref_rejected": -268.920166015625, "logps/rejected": -499.3272705078125, "loss": 0.1571, "rewards/accuracies": 1.0, "rewards/chosen": -4.4365129470825195, "rewards/grad_term": 0.0013110407162457705, "rewards/margins": 7.083842754364014, "rewards/rejected": -11.520356178283691, "step": 368 }, { "epoch": 0.7905731119442957, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.573319632626368, "learning_rate": 4.7112171837708825e-07, "logits/chosen": 0.9512593150138855, "logits/rejected": 0.7041115164756775, "logps/accuracies": 0.75, "logps/chosen": -443.4069519042969, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -363.83258056640625, "logps/ref_rejected": -308.1307373046875, "logps/rejected": -442.51458740234375, "loss": 0.1507, "rewards/accuracies": 1.0, "rewards/chosen": -3.9787182807922363, "rewards/grad_term": 0.005531268659979105, "rewards/margins": 2.7404749393463135, "rewards/rejected": -6.719193458557129, "step": 369 }, { "epoch": 0.7927155865024103, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 5.584409310189419, "learning_rate": 4.7028639618138423e-07, "logits/chosen": 0.6789947152137756, "logits/rejected": 0.46235227584838867, "logps/accuracies": 0.5, "logps/chosen": -433.4268798828125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -329.707763671875, "logps/ref_rejected": -254.446533203125, "logps/rejected": -444.1405334472656, "loss": 0.1189, "rewards/accuracies": 0.75, "rewards/chosen": -5.185955047607422, "rewards/grad_term": 0.010698116384446621, "rewards/margins": 4.2987446784973145, "rewards/rejected": -9.484700202941895, "step": 370 }, { "epoch": 0.7948580610605249, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 11.82663440003344, "learning_rate": 4.694510739856802e-07, "logits/chosen": 0.8684903383255005, "logits/rejected": 0.864480197429657, "logps/accuracies": 1.0, "logps/chosen": -616.7009887695312, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -439.9715270996094, "logps/ref_rejected": -483.77447509765625, "logps/rejected": -717.4370727539062, "loss": 0.1749, "rewards/accuracies": 0.75, "rewards/chosen": -8.83647346496582, "rewards/grad_term": 0.009361391887068748, "rewards/margins": 2.84665846824646, "rewards/rejected": -11.68313217163086, "step": 371 }, { "epoch": 0.7970005356186395, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.700184569183158, "learning_rate": 4.686157517899761e-07, "logits/chosen": 0.3798208236694336, "logits/rejected": 0.18705379962921143, "logps/accuracies": 0.75, "logps/chosen": -290.2025451660156, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -256.36285400390625, "logps/ref_rejected": -187.69712829589844, "logps/rejected": -312.7464599609375, "loss": 0.1903, "rewards/accuracies": 1.0, "rewards/chosen": -1.6919848918914795, "rewards/grad_term": 0.0075987353920936584, "rewards/margins": 4.560481548309326, "rewards/rejected": -6.252466201782227, "step": 372 }, { "epoch": 0.7991430101767542, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.246215247843911, "learning_rate": 4.6778042959427206e-07, "logits/chosen": 0.8069986701011658, "logits/rejected": 0.7122650146484375, "logps/accuracies": 1.0, "logps/chosen": -267.30328369140625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -210.80979919433594, "logps/ref_rejected": -209.55477905273438, "logps/rejected": -342.04205322265625, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": -2.8246755599975586, "rewards/grad_term": 0.003649149788543582, "rewards/margins": 3.799687385559082, "rewards/rejected": -6.624362468719482, "step": 373 }, { "epoch": 0.8012854847348688, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.128291731066904, "learning_rate": 4.6694510739856804e-07, "logits/chosen": 0.8819460868835449, "logits/rejected": 0.760127067565918, "logps/accuracies": 0.75, "logps/chosen": -614.6334228515625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -528.1080932617188, "logps/ref_rejected": -390.6625061035156, "logps/rejected": -600.1669921875, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": -4.326266288757324, "rewards/grad_term": 0.0019579888321459293, "rewards/margins": 6.148959159851074, "rewards/rejected": -10.475224494934082, "step": 374 }, { "epoch": 0.8034279592929834, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 15.828594310560685, "learning_rate": 4.661097852028639e-07, "logits/chosen": 0.8489161729812622, "logits/rejected": 0.753060519695282, "logps/accuracies": 0.5, "logps/chosen": -382.72735595703125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -323.9566650390625, "logps/ref_rejected": -298.6670227050781, "logps/rejected": -467.1494140625, "loss": 0.1952, "rewards/accuracies": 1.0, "rewards/chosen": -2.938535213470459, "rewards/grad_term": 0.004120782017707825, "rewards/margins": 5.485583305358887, "rewards/rejected": -8.424118041992188, "step": 375 }, { "epoch": 0.8055704338510981, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 10.583125585454663, "learning_rate": 4.652744630071599e-07, "logits/chosen": 0.5661925673484802, "logits/rejected": 0.3597102761268616, "logps/accuracies": 0.5, "logps/chosen": -306.2489318847656, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -252.90496826171875, "logps/ref_rejected": -201.43569946289062, "logps/rejected": -383.89324951171875, "loss": 0.1964, "rewards/accuracies": 1.0, "rewards/chosen": -2.6671996116638184, "rewards/grad_term": 0.0024612259585410357, "rewards/margins": 6.455678462982178, "rewards/rejected": -9.122878074645996, "step": 376 }, { "epoch": 0.8077129084092126, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.4392788570254815, "learning_rate": 4.644391408114558e-07, "logits/chosen": 0.3249109983444214, "logits/rejected": 0.32942840456962585, "logps/accuracies": 0.75, "logps/chosen": -448.28216552734375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -387.504638671875, "logps/ref_rejected": -307.7704162597656, "logps/rejected": -473.29437255859375, "loss": 0.1221, "rewards/accuracies": 1.0, "rewards/chosen": -3.0388755798339844, "rewards/grad_term": 0.0016592949395999312, "rewards/margins": 5.237322807312012, "rewards/rejected": -8.276198387145996, "step": 377 }, { "epoch": 0.8098553829673273, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.329181119972007, "learning_rate": 4.636038186157518e-07, "logits/chosen": 0.404056191444397, "logits/rejected": 0.6022278070449829, "logps/accuracies": 1.0, "logps/chosen": -382.3240051269531, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -305.3075256347656, "logps/ref_rejected": -321.82904052734375, "logps/rejected": -545.7882080078125, "loss": 0.1446, "rewards/accuracies": 1.0, "rewards/chosen": -3.8508248329162598, "rewards/grad_term": 0.0002711013949010521, "rewards/margins": 7.347134590148926, "rewards/rejected": -11.197959899902344, "step": 378 }, { "epoch": 0.8119978575254418, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.175949590736797, "learning_rate": 4.6276849642004767e-07, "logits/chosen": 0.751959502696991, "logits/rejected": 0.5633019804954529, "logps/accuracies": 1.0, "logps/chosen": -502.3103942871094, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -433.314697265625, "logps/ref_rejected": -358.736083984375, "logps/rejected": -592.6541748046875, "loss": 0.1823, "rewards/accuracies": 1.0, "rewards/chosen": -3.449784278869629, "rewards/grad_term": 4.590416210703552e-05, "rewards/margins": 8.24611759185791, "rewards/rejected": -11.695901870727539, "step": 379 }, { "epoch": 0.8141403320835565, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.0, "grad_norm": 11.61637016421897, "learning_rate": 4.6193317422434364e-07, "logits/chosen": 0.5796254873275757, "logits/rejected": 0.6662420630455017, "logps/accuracies": 1.0, "logps/chosen": -412.05316162109375, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -339.7696533203125, "logps/ref_rejected": -340.8929748535156, "logps/rejected": -524.9725341796875, "loss": 0.1719, "rewards/accuracies": 1.0, "rewards/chosen": -3.61417555809021, "rewards/grad_term": 0.0034167964477092028, "rewards/margins": 5.589802265167236, "rewards/rejected": -9.203977584838867, "step": 380 }, { "epoch": 0.8162828066416711, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.094201585504359, "learning_rate": 4.610978520286396e-07, "logits/chosen": 0.7151302695274353, "logits/rejected": 0.6419375538825989, "logps/accuracies": 1.0, "logps/chosen": -305.9559326171875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -270.7724609375, "logps/ref_rejected": -270.0223083496094, "logps/rejected": -445.2619934082031, "loss": 0.125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7591745853424072, "rewards/grad_term": 0.0021109317895025015, "rewards/margins": 7.002810001373291, "rewards/rejected": -8.761983871459961, "step": 381 }, { "epoch": 0.8184252811997857, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 14.362881277342565, "learning_rate": 4.602625298329356e-07, "logits/chosen": 0.8437117338180542, "logits/rejected": 0.7384243607521057, "logps/accuracies": 0.5, "logps/chosen": -461.12603759765625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -386.73773193359375, "logps/ref_rejected": -304.27459716796875, "logps/rejected": -513.4872436523438, "loss": 0.1601, "rewards/accuracies": 1.0, "rewards/chosen": -3.7194161415100098, "rewards/grad_term": 0.00258549302816391, "rewards/margins": 6.741215705871582, "rewards/rejected": -10.46063232421875, "step": 382 }, { "epoch": 0.8205677557579004, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.456611514133404, "learning_rate": 4.594272076372315e-07, "logits/chosen": 0.7477813363075256, "logits/rejected": 0.3820553123950958, "logps/accuracies": 0.75, "logps/chosen": -507.67462158203125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -389.72613525390625, "logps/ref_rejected": -367.64569091796875, "logps/rejected": -579.4849243164062, "loss": 0.1604, "rewards/accuracies": 1.0, "rewards/chosen": -5.89742374420166, "rewards/grad_term": 0.007863424718379974, "rewards/margins": 4.694538593292236, "rewards/rejected": -10.591961860656738, "step": 383 }, { "epoch": 0.822710230316015, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 5.8619420532382325, "learning_rate": 4.5859188544152745e-07, "logits/chosen": 0.7787905931472778, "logits/rejected": 0.767812967300415, "logps/accuracies": 0.75, "logps/chosen": -384.25, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -312.675537109375, "logps/ref_rejected": -270.84429931640625, "logps/rejected": -437.036376953125, "loss": 0.1155, "rewards/accuracies": 1.0, "rewards/chosen": -3.5787243843078613, "rewards/grad_term": 0.003301040269434452, "rewards/margins": 4.730880260467529, "rewards/rejected": -8.30960464477539, "step": 384 }, { "epoch": 0.822710230316015, "eval_flips/correct->correct": 0.14000000059604645, "eval_flips/correct->incorrect": 0.019999999552965164, "eval_flips/incorrect->correct": 0.5400000214576721, "eval_flips/incorrect->incorrect": 0.30000001192092896, "eval_logits/chosen": 0.6912816762924194, "eval_logits/rejected": 0.5833743810653687, "eval_logps/accuracies": 0.6800000071525574, "eval_logps/chosen": -383.8902282714844, "eval_logps/ref_accuracies": 0.1599999964237213, "eval_logps/ref_chosen": -323.51568603515625, "eval_logps/ref_rejected": -258.70098876953125, "eval_logps/rejected": -404.5517578125, "eval_loss": 0.15501657128334045, "eval_rewards/accuracies": 0.8799999952316284, "eval_rewards/chosen": -3.018728256225586, "eval_rewards/grad_term": 0.00715669384226203, "eval_rewards/margins": 4.273808002471924, "eval_rewards/rejected": -7.292536735534668, "eval_runtime": 372.7419, "eval_samples_per_second": 4.239, "eval_steps_per_second": 0.134, "step": 384 }, { "epoch": 0.8248527048741296, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 8.34443643722511, "learning_rate": 4.577565632458234e-07, "logits/chosen": 0.6030393242835999, "logits/rejected": 0.5619946718215942, "logps/accuracies": 0.5, "logps/chosen": -445.7275695800781, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -341.169921875, "logps/ref_rejected": -280.4991455078125, "logps/rejected": -452.7607116699219, "loss": 0.1376, "rewards/accuracies": 1.0, "rewards/chosen": -5.227883815765381, "rewards/grad_term": 0.003972820471972227, "rewards/margins": 3.3851945400238037, "rewards/rejected": -8.613078117370605, "step": 385 }, { "epoch": 0.8269951794322442, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.067319624450543, "learning_rate": 4.569212410501193e-07, "logits/chosen": 0.6664745211601257, "logits/rejected": 0.6004454493522644, "logps/accuracies": 0.5, "logps/chosen": -461.43505859375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -375.47607421875, "logps/ref_rejected": -329.8832092285156, "logps/rejected": -491.4964599609375, "loss": 0.1263, "rewards/accuracies": 1.0, "rewards/chosen": -4.297948360443115, "rewards/grad_term": 0.006247952580451965, "rewards/margins": 3.7827141284942627, "rewards/rejected": -8.080662727355957, "step": 386 }, { "epoch": 0.8291376539903589, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.42647077537532, "learning_rate": 4.5608591885441523e-07, "logits/chosen": 0.6861754655838013, "logits/rejected": 0.39711299538612366, "logps/accuracies": 0.75, "logps/chosen": -266.0076904296875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -222.40213012695312, "logps/ref_rejected": -151.0525665283203, "logps/rejected": -257.8247985839844, "loss": 0.1474, "rewards/accuracies": 1.0, "rewards/chosen": -2.1802759170532227, "rewards/grad_term": 0.0029023890383541584, "rewards/margins": 3.1583352088928223, "rewards/rejected": -5.338611125946045, "step": 387 }, { "epoch": 0.8312801285484734, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.54998768600324, "learning_rate": 4.552505966587112e-07, "logits/chosen": 0.7813968658447266, "logits/rejected": 0.675197184085846, "logps/accuracies": 0.75, "logps/chosen": -384.80474853515625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -335.5448913574219, "logps/ref_rejected": -288.6382141113281, "logps/rejected": -429.424072265625, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": -2.462993621826172, "rewards/grad_term": 0.004009037744253874, "rewards/margins": 4.57629919052124, "rewards/rejected": -7.039292335510254, "step": 388 }, { "epoch": 0.8334226031065881, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.3771391402744175, "learning_rate": 4.544152744630072e-07, "logits/chosen": 0.5336940884590149, "logits/rejected": 0.61911940574646, "logps/accuracies": 1.0, "logps/chosen": -435.4273986816406, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -310.7679443359375, "logps/ref_rejected": -220.2869110107422, "logps/rejected": -539.0606079101562, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": -6.232973098754883, "rewards/grad_term": 0.002168377162888646, "rewards/margins": 9.705711364746094, "rewards/rejected": -15.938684463500977, "step": 389 }, { "epoch": 0.8355650776647028, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 5.772490840913882, "learning_rate": 4.5357995226730306e-07, "logits/chosen": 0.7710881233215332, "logits/rejected": 0.52399080991745, "logps/accuracies": 0.75, "logps/chosen": -394.009033203125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -299.70672607421875, "logps/ref_rejected": -240.57080078125, "logps/rejected": -437.76953125, "loss": 0.0987, "rewards/accuracies": 1.0, "rewards/chosen": -4.715115070343018, "rewards/grad_term": 0.0006557138403877616, "rewards/margins": 5.14482307434082, "rewards/rejected": -9.85993766784668, "step": 390 }, { "epoch": 0.8377075522228173, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.038299673221383, "learning_rate": 4.5274463007159904e-07, "logits/chosen": 0.825120747089386, "logits/rejected": 0.6199119687080383, "logps/accuracies": 0.25, "logps/chosen": -434.44329833984375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -317.6658630371094, "logps/ref_rejected": -241.26890563964844, "logps/rejected": -444.71636962890625, "loss": 0.1355, "rewards/accuracies": 1.0, "rewards/chosen": -5.838871002197266, "rewards/grad_term": 0.001953305210918188, "rewards/margins": 4.333502769470215, "rewards/rejected": -10.172372817993164, "step": 391 }, { "epoch": 0.839850026780932, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.157799585200374, "learning_rate": 4.5190930787589496e-07, "logits/chosen": 0.7274588346481323, "logits/rejected": 0.658560037612915, "logps/accuracies": 1.0, "logps/chosen": -377.17510986328125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -304.23382568359375, "logps/ref_rejected": -287.71441650390625, "logps/rejected": -464.94073486328125, "loss": 0.1512, "rewards/accuracies": 1.0, "rewards/chosen": -3.6470634937286377, "rewards/grad_term": 0.0007109759608283639, "rewards/margins": 5.214252948760986, "rewards/rejected": -8.861316680908203, "step": 392 }, { "epoch": 0.8419925013390466, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.274433302537343, "learning_rate": 4.510739856801909e-07, "logits/chosen": 0.5300111770629883, "logits/rejected": 0.4925421476364136, "logps/accuracies": 0.75, "logps/chosen": -535.7554321289062, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -405.7460021972656, "logps/ref_rejected": -341.57281494140625, "logps/rejected": -637.828369140625, "loss": 0.1401, "rewards/accuracies": 1.0, "rewards/chosen": -6.500471115112305, "rewards/grad_term": 0.0015781800029799342, "rewards/margins": 8.312305450439453, "rewards/rejected": -14.812776565551758, "step": 393 }, { "epoch": 0.8441349758971612, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.636233486579053, "learning_rate": 4.502386634844868e-07, "logits/chosen": 0.4728167653083801, "logits/rejected": 0.7265413999557495, "logps/accuracies": 1.0, "logps/chosen": -329.78082275390625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -262.4573669433594, "logps/ref_rejected": -250.28829956054688, "logps/rejected": -432.49798583984375, "loss": 0.1203, "rewards/accuracies": 1.0, "rewards/chosen": -3.3661720752716064, "rewards/grad_term": 0.002081993967294693, "rewards/margins": 5.744311332702637, "rewards/rejected": -9.110483169555664, "step": 394 }, { "epoch": 0.8462774504552758, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.191471059477506, "learning_rate": 4.494033412887828e-07, "logits/chosen": 0.5292783975601196, "logits/rejected": 0.2622219920158386, "logps/accuracies": 0.75, "logps/chosen": -296.090576171875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -242.09730529785156, "logps/ref_rejected": -185.52487182617188, "logps/rejected": -350.0084228515625, "loss": 0.1594, "rewards/accuracies": 1.0, "rewards/chosen": -2.6996638774871826, "rewards/grad_term": 0.004369885195046663, "rewards/margins": 5.5245137214660645, "rewards/rejected": -8.224177360534668, "step": 395 }, { "epoch": 0.8484199250133905, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.697422074983217, "learning_rate": 4.4856801909307877e-07, "logits/chosen": 0.5663112998008728, "logits/rejected": 0.41860711574554443, "logps/accuracies": 0.75, "logps/chosen": -370.9486083984375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -322.1562805175781, "logps/ref_rejected": -241.59457397460938, "logps/rejected": -408.6671142578125, "loss": 0.1077, "rewards/accuracies": 0.75, "rewards/chosen": -2.439617156982422, "rewards/grad_term": 0.0069171166978776455, "rewards/margins": 5.914010047912598, "rewards/rejected": -8.353628158569336, "step": 396 }, { "epoch": 0.850562399571505, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 13.015020922003387, "learning_rate": 4.4773269689737464e-07, "logits/chosen": 0.7214323282241821, "logits/rejected": 0.633575975894928, "logps/accuracies": 1.0, "logps/chosen": -596.1569213867188, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -488.6678161621094, "logps/ref_rejected": -393.7802734375, "logps/rejected": -751.7131958007812, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": -5.374455451965332, "rewards/grad_term": 6.355544996949902e-07, "rewards/margins": 12.522188186645508, "rewards/rejected": -17.896644592285156, "step": 397 }, { "epoch": 0.8527048741296197, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 4.892748358005199, "learning_rate": 4.468973747016706e-07, "logits/chosen": 0.7805695533752441, "logits/rejected": 0.6413770914077759, "logps/accuracies": 1.0, "logps/chosen": -511.5725402832031, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -438.80120849609375, "logps/ref_rejected": -409.7392578125, "logps/rejected": -677.0660400390625, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": -3.6385655403137207, "rewards/grad_term": 2.607361057016533e-05, "rewards/margins": 9.727773666381836, "rewards/rejected": -13.366338729858398, "step": 398 }, { "epoch": 0.8548473486877344, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 12.448276262372078, "learning_rate": 4.460620525059666e-07, "logits/chosen": 0.7015663385391235, "logits/rejected": 0.5971590280532837, "logps/accuracies": 1.0, "logps/chosen": -475.1459655761719, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -387.7801818847656, "logps/ref_rejected": -357.5787353515625, "logps/rejected": -657.3110961914062, "loss": 0.1691, "rewards/accuracies": 1.0, "rewards/chosen": -4.368288993835449, "rewards/grad_term": 0.0006424501189030707, "rewards/margins": 10.618330955505371, "rewards/rejected": -14.98661994934082, "step": 399 }, { "epoch": 0.8569898232458489, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.52996684599927, "learning_rate": 4.452267303102625e-07, "logits/chosen": 0.5823007822036743, "logits/rejected": 0.5328406095504761, "logps/accuracies": 1.0, "logps/chosen": -302.14312744140625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -234.80886840820312, "logps/ref_rejected": -250.11843872070312, "logps/rejected": -457.92333984375, "loss": 0.1442, "rewards/accuracies": 1.0, "rewards/chosen": -3.3667118549346924, "rewards/grad_term": 0.0009050341905094683, "rewards/margins": 7.023532867431641, "rewards/rejected": -10.390244483947754, "step": 400 }, { "epoch": 0.8591322978039636, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.537952351751538, "learning_rate": 4.4439140811455845e-07, "logits/chosen": 0.929060161113739, "logits/rejected": 0.7344148755073547, "logps/accuracies": 0.5, "logps/chosen": -484.63690185546875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -397.1466064453125, "logps/ref_rejected": -278.0077819824219, "logps/rejected": -507.8617858886719, "loss": 0.1082, "rewards/accuracies": 1.0, "rewards/chosen": -4.374514102935791, "rewards/grad_term": 0.0002766952384263277, "rewards/margins": 7.11818790435791, "rewards/rejected": -11.492701530456543, "step": 401 }, { "epoch": 0.8612747723620782, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 5.644686107983424, "learning_rate": 4.435560859188544e-07, "logits/chosen": 0.771916389465332, "logits/rejected": 0.6290404200553894, "logps/accuracies": 0.75, "logps/chosen": -315.5869140625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -250.51466369628906, "logps/ref_rejected": -214.51107788085938, "logps/rejected": -375.64447021484375, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": -3.253613233566284, "rewards/grad_term": 0.0008995306561701, "rewards/margins": 4.803055763244629, "rewards/rejected": -8.056669235229492, "step": 402 }, { "epoch": 0.8634172469201928, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.733257087221838, "learning_rate": 4.4272076372315035e-07, "logits/chosen": 0.7453795075416565, "logits/rejected": 0.6315554976463318, "logps/accuracies": 1.0, "logps/chosen": -232.2364501953125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -196.71493530273438, "logps/ref_rejected": -155.52903747558594, "logps/rejected": -270.3780822753906, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": -1.7760753631591797, "rewards/grad_term": 0.002003555418923497, "rewards/margins": 3.966376543045044, "rewards/rejected": -5.7424516677856445, "step": 403 }, { "epoch": 0.8655597214783074, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.60971738924586, "learning_rate": 4.418854415274462e-07, "logits/chosen": 0.7506588697433472, "logits/rejected": 0.6142828464508057, "logps/accuracies": 1.0, "logps/chosen": -397.05242919921875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -317.8343505859375, "logps/ref_rejected": -303.29852294921875, "logps/rejected": -455.8990783691406, "loss": 0.1456, "rewards/accuracies": 1.0, "rewards/chosen": -3.9609031677246094, "rewards/grad_term": 0.0018456345424056053, "rewards/margins": 3.6691231727600098, "rewards/rejected": -7.630025863647461, "step": 404 }, { "epoch": 0.8677021960364221, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.056320190778132, "learning_rate": 4.410501193317422e-07, "logits/chosen": 0.5372669696807861, "logits/rejected": 0.5302236676216125, "logps/accuracies": 0.75, "logps/chosen": -419.73486328125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -331.661865234375, "logps/ref_rejected": -297.73260498046875, "logps/rejected": -506.001220703125, "loss": 0.1351, "rewards/accuracies": 1.0, "rewards/chosen": -4.403650283813477, "rewards/grad_term": 0.0054626427590847015, "rewards/margins": 6.0097808837890625, "rewards/rejected": -10.413432121276855, "step": 405 }, { "epoch": 0.8698446705945367, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.174234002962357, "learning_rate": 4.402147971360382e-07, "logits/chosen": 0.7001669406890869, "logits/rejected": 0.39515483379364014, "logps/accuracies": 0.75, "logps/chosen": -257.07257080078125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -186.89627075195312, "logps/ref_rejected": -187.3160400390625, "logps/rejected": -366.7100524902344, "loss": 0.1342, "rewards/accuracies": 1.0, "rewards/chosen": -3.508815288543701, "rewards/grad_term": 0.0030996918212622404, "rewards/margins": 5.4608845710754395, "rewards/rejected": -8.96969985961914, "step": 406 }, { "epoch": 0.8719871451526513, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 5.813541027309185, "learning_rate": 4.3937947494033416e-07, "logits/chosen": 0.7106292247772217, "logits/rejected": 0.17329132556915283, "logps/accuracies": 0.5, "logps/chosen": -406.671142578125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -309.2118835449219, "logps/ref_rejected": -226.40023803710938, "logps/rejected": -371.4858093261719, "loss": 0.1001, "rewards/accuracies": 1.0, "rewards/chosen": -4.872962474822998, "rewards/grad_term": 0.007784712128341198, "rewards/margins": 2.381316900253296, "rewards/rejected": -7.254279136657715, "step": 407 }, { "epoch": 0.874129619710766, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 5.142665145862327, "learning_rate": 4.3854415274463003e-07, "logits/chosen": 0.6431873440742493, "logits/rejected": 0.5117607116699219, "logps/accuracies": 0.5, "logps/chosen": -388.5247802734375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -360.99249267578125, "logps/ref_rejected": -295.61029052734375, "logps/rejected": -396.9288330078125, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": -1.3766143321990967, "rewards/grad_term": 0.008713779971003532, "rewards/margins": 3.689311981201172, "rewards/rejected": -5.0659260749816895, "step": 408 }, { "epoch": 0.8762720942688805, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.302023436651439, "learning_rate": 4.37708830548926e-07, "logits/chosen": 0.7255518436431885, "logits/rejected": 0.6550572514533997, "logps/accuracies": 1.0, "logps/chosen": -503.82879638671875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -395.3870849609375, "logps/ref_rejected": -354.33709716796875, "logps/rejected": -625.1456298828125, "loss": 0.1401, "rewards/accuracies": 1.0, "rewards/chosen": -5.422085285186768, "rewards/grad_term": 0.0003362699644640088, "rewards/margins": 8.118339538574219, "rewards/rejected": -13.540424346923828, "step": 409 }, { "epoch": 0.8784145688269952, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.292464980810114, "learning_rate": 4.3687350835322194e-07, "logits/chosen": 0.6224971413612366, "logits/rejected": 0.7813397645950317, "logps/accuracies": 1.0, "logps/chosen": -246.21495056152344, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -178.90476989746094, "logps/ref_rejected": -192.27903747558594, "logps/rejected": -309.7457275390625, "loss": 0.0869, "rewards/accuracies": 1.0, "rewards/chosen": -3.365509033203125, "rewards/grad_term": 0.005968024022877216, "rewards/margins": 2.5078253746032715, "rewards/rejected": -5.8733344078063965, "step": 410 }, { "epoch": 0.8805570433851098, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.994486636309479, "learning_rate": 4.360381861575179e-07, "logits/chosen": 0.5689117908477783, "logits/rejected": 0.4422415494918823, "logps/accuracies": 0.75, "logps/chosen": -327.41168212890625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -288.2694091796875, "logps/ref_rejected": -235.6809844970703, "logps/rejected": -354.2411804199219, "loss": 0.1481, "rewards/accuracies": 1.0, "rewards/chosen": -1.9571146965026855, "rewards/grad_term": 0.01035099197179079, "rewards/margins": 3.9708945751190186, "rewards/rejected": -5.928009033203125, "step": 411 }, { "epoch": 0.8826995179432244, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.858727469447434, "learning_rate": 4.352028639618138e-07, "logits/chosen": 0.758124589920044, "logits/rejected": 0.7636011242866516, "logps/accuracies": 1.0, "logps/chosen": -433.85906982421875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -363.50360107421875, "logps/ref_rejected": -367.9163513183594, "logps/rejected": -580.310791015625, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/chosen": -3.5177736282348633, "rewards/grad_term": 0.0005504750879481435, "rewards/margins": 7.101946830749512, "rewards/rejected": -10.619720458984375, "step": 412 }, { "epoch": 0.884841992501339, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.354863325065816, "learning_rate": 4.3436754176610977e-07, "logits/chosen": 0.6459320783615112, "logits/rejected": 0.5770057439804077, "logps/accuracies": 1.0, "logps/chosen": -574.0004272460938, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -458.69244384765625, "logps/ref_rejected": -463.82720947265625, "logps/rejected": -754.6375122070312, "loss": 0.0889, "rewards/accuracies": 1.0, "rewards/chosen": -5.7653985023498535, "rewards/grad_term": 0.00017192790983244777, "rewards/margins": 8.775115966796875, "rewards/rejected": -14.540514945983887, "step": 413 }, { "epoch": 0.8869844670594537, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.641937024128401, "learning_rate": 4.3353221957040575e-07, "logits/chosen": 0.6838860511779785, "logits/rejected": 0.5406701564788818, "logps/accuracies": 0.75, "logps/chosen": -305.5446472167969, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -251.28353881835938, "logps/ref_rejected": -207.93319702148438, "logps/rejected": -345.9137268066406, "loss": 0.1227, "rewards/accuracies": 1.0, "rewards/chosen": -2.7130556106567383, "rewards/grad_term": 0.000992645276710391, "rewards/margins": 4.185970306396484, "rewards/rejected": -6.899025917053223, "step": 414 }, { "epoch": 0.8891269416175683, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.853463086288327, "learning_rate": 4.326968973747016e-07, "logits/chosen": 0.6326106786727905, "logits/rejected": 0.5018441677093506, "logps/accuracies": 1.0, "logps/chosen": -350.4998779296875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -291.879150390625, "logps/ref_rejected": -234.96595764160156, "logps/rejected": -414.79351806640625, "loss": 0.1072, "rewards/accuracies": 0.75, "rewards/chosen": -2.931034564971924, "rewards/grad_term": 0.007088256999850273, "rewards/margins": 6.060344696044922, "rewards/rejected": -8.991378784179688, "step": 415 }, { "epoch": 0.8912694161756829, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 10.353678579870046, "learning_rate": 4.318615751789976e-07, "logits/chosen": 0.7541974186897278, "logits/rejected": 0.6350035667419434, "logps/accuracies": 1.0, "logps/chosen": -488.8894348144531, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -403.95233154296875, "logps/ref_rejected": -311.3324890136719, "logps/rejected": -574.9945678710938, "loss": 0.1862, "rewards/accuracies": 1.0, "rewards/chosen": -4.246854782104492, "rewards/grad_term": 0.0008102880092337728, "rewards/margins": 8.936247825622559, "rewards/rejected": -13.18310260772705, "step": 416 }, { "epoch": 0.8934118907337976, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 10.150177255687934, "learning_rate": 4.310262529832935e-07, "logits/chosen": 0.7215501666069031, "logits/rejected": 0.7029194235801697, "logps/accuracies": 1.0, "logps/chosen": -449.3460693359375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -361.64422607421875, "logps/ref_rejected": -362.3080139160156, "logps/rejected": -613.3135375976562, "loss": 0.1859, "rewards/accuracies": 1.0, "rewards/chosen": -4.385092258453369, "rewards/grad_term": 0.0001239069679286331, "rewards/margins": 8.165184020996094, "rewards/rejected": -12.550276756286621, "step": 417 }, { "epoch": 0.8955543652919121, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 7.057751158218558, "learning_rate": 4.301909307875895e-07, "logits/chosen": 0.49071842432022095, "logits/rejected": 0.4239951968193054, "logps/accuracies": 0.25, "logps/chosen": -355.53564453125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -259.5860900878906, "logps/ref_rejected": -194.45254516601562, "logps/rejected": -348.5555725097656, "loss": 0.1136, "rewards/accuracies": 1.0, "rewards/chosen": -4.797478675842285, "rewards/grad_term": 0.01176033727824688, "rewards/margins": 2.9076716899871826, "rewards/rejected": -7.705150127410889, "step": 418 }, { "epoch": 0.8976968398500268, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.128643742839769, "learning_rate": 4.2935560859188537e-07, "logits/chosen": 0.5078434944152832, "logits/rejected": 0.3587588369846344, "logps/accuracies": 0.75, "logps/chosen": -334.2418518066406, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -276.947998046875, "logps/ref_rejected": -290.69976806640625, "logps/rejected": -465.8039245605469, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": -2.864691972732544, "rewards/grad_term": 0.001770102884620428, "rewards/margins": 5.890515327453613, "rewards/rejected": -8.755208015441895, "step": 419 }, { "epoch": 0.8998393144081414, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.460646869207045, "learning_rate": 4.2852028639618135e-07, "logits/chosen": 0.6151570081710815, "logits/rejected": 0.5713327527046204, "logps/accuracies": 0.75, "logps/chosen": -357.33209228515625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -289.7698974609375, "logps/ref_rejected": -239.04769897460938, "logps/rejected": -366.888427734375, "loss": 0.1259, "rewards/accuracies": 1.0, "rewards/chosen": -3.378108501434326, "rewards/grad_term": 0.004311670083552599, "rewards/margins": 3.013929843902588, "rewards/rejected": -6.392038345336914, "step": 420 }, { "epoch": 0.901981788966256, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.054391516220369, "learning_rate": 4.2768496420047733e-07, "logits/chosen": 0.6619023084640503, "logits/rejected": 0.6487151980400085, "logps/accuracies": 0.75, "logps/chosen": -419.2369689941406, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -322.2930603027344, "logps/ref_rejected": -283.63616943359375, "logps/rejected": -483.73504638671875, "loss": 0.1269, "rewards/accuracies": 1.0, "rewards/chosen": -4.847194671630859, "rewards/grad_term": 0.0035053249448537827, "rewards/margins": 5.157749652862549, "rewards/rejected": -10.004944801330566, "step": 421 }, { "epoch": 0.9041242635243707, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.627822881128733, "learning_rate": 4.268496420047732e-07, "logits/chosen": 0.7480889558792114, "logits/rejected": 0.6404839754104614, "logps/accuracies": 0.75, "logps/chosen": -348.94525146484375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -275.64013671875, "logps/ref_rejected": -239.08200073242188, "logps/rejected": -421.51513671875, "loss": 0.0891, "rewards/accuracies": 0.75, "rewards/chosen": -3.665255546569824, "rewards/grad_term": 0.008442794904112816, "rewards/margins": 5.456399917602539, "rewards/rejected": -9.121655464172363, "step": 422 }, { "epoch": 0.9062667380824853, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 7.335769347864956, "learning_rate": 4.260143198090692e-07, "logits/chosen": 0.6401829123497009, "logits/rejected": 0.7815831899642944, "logps/accuracies": 0.5, "logps/chosen": -379.6376647949219, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -265.36968994140625, "logps/ref_rejected": -255.16949462890625, "logps/rejected": -414.13201904296875, "loss": 0.1133, "rewards/accuracies": 1.0, "rewards/chosen": -5.713399410247803, "rewards/grad_term": 0.007248271256685257, "rewards/margins": 2.2347278594970703, "rewards/rejected": -7.948126792907715, "step": 423 }, { "epoch": 0.9084092126405999, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 9.129137286638404, "learning_rate": 4.2517899761336516e-07, "logits/chosen": 0.6732430458068848, "logits/rejected": 0.5181256532669067, "logps/accuracies": 0.25, "logps/chosen": -605.279541015625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -473.38079833984375, "logps/ref_rejected": -406.4195861816406, "logps/rejected": -641.9754638671875, "loss": 0.1132, "rewards/accuracies": 1.0, "rewards/chosen": -6.594937801361084, "rewards/grad_term": 0.0023317153099924326, "rewards/margins": 5.182857513427734, "rewards/rejected": -11.777795791625977, "step": 424 }, { "epoch": 0.9105516871987145, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 14.734996244623405, "learning_rate": 4.243436754176611e-07, "logits/chosen": 0.48378825187683105, "logits/rejected": 0.3249673843383789, "logps/accuracies": 0.75, "logps/chosen": -461.893310546875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -390.8454895019531, "logps/ref_rejected": -320.3680419921875, "logps/rejected": -476.0527038574219, "loss": 0.1884, "rewards/accuracies": 1.0, "rewards/chosen": -3.552391290664673, "rewards/grad_term": 0.004285210277885199, "rewards/margins": 4.231842041015625, "rewards/rejected": -7.784233093261719, "step": 425 }, { "epoch": 0.9126941617568292, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.389398153896957, "learning_rate": 4.23508353221957e-07, "logits/chosen": 0.7646510004997253, "logits/rejected": 0.669806957244873, "logps/accuracies": 1.0, "logps/chosen": -448.4755859375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -365.5307922363281, "logps/ref_rejected": -346.1925354003906, "logps/rejected": -590.5164184570312, "loss": 0.1623, "rewards/accuracies": 1.0, "rewards/chosen": -4.147238254547119, "rewards/grad_term": 0.00027649421826936305, "rewards/margins": 8.068957328796387, "rewards/rejected": -12.216196060180664, "step": 426 }, { "epoch": 0.9148366363149437, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 11.370091633753265, "learning_rate": 4.2267303102625293e-07, "logits/chosen": 0.7482293844223022, "logits/rejected": 0.6367194652557373, "logps/accuracies": 1.0, "logps/chosen": -521.5593872070312, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -416.5623779296875, "logps/ref_rejected": -381.7167663574219, "logps/rejected": -664.441162109375, "loss": 0.1458, "rewards/accuracies": 1.0, "rewards/chosen": -5.249850273132324, "rewards/grad_term": 0.0006000981666147709, "rewards/margins": 8.886370658874512, "rewards/rejected": -14.136220932006836, "step": 427 }, { "epoch": 0.9169791108730584, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 7.038400084548858, "learning_rate": 4.218377088305489e-07, "logits/chosen": 0.8124886751174927, "logits/rejected": 0.47504645586013794, "logps/accuracies": 0.25, "logps/chosen": -516.41015625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -438.93255615234375, "logps/ref_rejected": -273.8423156738281, "logps/rejected": -499.37445068359375, "loss": 0.145, "rewards/accuracies": 1.0, "rewards/chosen": -3.8738789558410645, "rewards/grad_term": 0.0022317732218652964, "rewards/margins": 7.402729034423828, "rewards/rejected": -11.276607513427734, "step": 428 }, { "epoch": 0.919121585431173, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.572014319270594, "learning_rate": 4.210023866348449e-07, "logits/chosen": 0.6831299066543579, "logits/rejected": 0.6394398212432861, "logps/accuracies": 1.0, "logps/chosen": -486.3926086425781, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -410.18096923828125, "logps/ref_rejected": -368.5516357421875, "logps/rejected": -578.9801635742188, "loss": 0.1459, "rewards/accuracies": 1.0, "rewards/chosen": -3.8105835914611816, "rewards/grad_term": 0.00010745471809059381, "rewards/margins": 6.710842609405518, "rewards/rejected": -10.521427154541016, "step": 429 }, { "epoch": 0.9212640599892876, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 11.689438797952636, "learning_rate": 4.2016706443914076e-07, "logits/chosen": 0.7243601083755493, "logits/rejected": 0.6393451690673828, "logps/accuracies": 1.0, "logps/chosen": -562.4426879882812, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -483.3052978515625, "logps/ref_rejected": -426.74920654296875, "logps/rejected": -627.9085693359375, "loss": 0.186, "rewards/accuracies": 1.0, "rewards/chosen": -3.9568703174591064, "rewards/grad_term": 0.000488718505948782, "rewards/margins": 6.101097106933594, "rewards/rejected": -10.057967185974121, "step": 430 }, { "epoch": 0.9234065345474023, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 5.8721575355892615, "learning_rate": 4.1933174224343674e-07, "logits/chosen": 0.7541458010673523, "logits/rejected": 0.6002135276794434, "logps/accuracies": 1.0, "logps/chosen": -398.8825378417969, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -333.1788330078125, "logps/ref_rejected": -287.464599609375, "logps/rejected": -493.71929931640625, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": -3.285184860229492, "rewards/grad_term": 0.0002467437880113721, "rewards/margins": 7.027547359466553, "rewards/rejected": -10.312731742858887, "step": 431 }, { "epoch": 0.9255490091055169, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 5.73075697487602, "learning_rate": 4.184964200477327e-07, "logits/chosen": 0.5785739421844482, "logits/rejected": 0.6086280345916748, "logps/accuracies": 1.0, "logps/chosen": -356.680419921875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -259.8116149902344, "logps/ref_rejected": -233.67578125, "logps/rejected": -508.74652099609375, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": -4.843441009521484, "rewards/grad_term": 0.003247784450650215, "rewards/margins": 8.91009521484375, "rewards/rejected": -13.75353717803955, "step": 432 }, { "epoch": 0.9276914836636315, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.580143703823092, "learning_rate": 4.176610978520286e-07, "logits/chosen": 0.5819364190101624, "logits/rejected": 0.6432737708091736, "logps/accuracies": 0.75, "logps/chosen": -228.54739379882812, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -149.87022399902344, "logps/ref_rejected": -146.04185485839844, "logps/rejected": -256.3289489746094, "loss": 0.1975, "rewards/accuracies": 0.75, "rewards/chosen": -3.933859348297119, "rewards/grad_term": 0.014459378086030483, "rewards/margins": 1.580496072769165, "rewards/rejected": -5.514355659484863, "step": 433 }, { "epoch": 0.9298339582217461, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.738131094632276, "learning_rate": 4.1682577565632457e-07, "logits/chosen": 0.6606433391571045, "logits/rejected": 0.7345594167709351, "logps/accuracies": 0.75, "logps/chosen": -257.4786376953125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -191.63543701171875, "logps/ref_rejected": -178.1594696044922, "logps/rejected": -321.8557434082031, "loss": 0.1658, "rewards/accuracies": 1.0, "rewards/chosen": -3.2921600341796875, "rewards/grad_term": 0.003366851480677724, "rewards/margins": 3.8926539421081543, "rewards/rejected": -7.184814453125, "step": 434 }, { "epoch": 0.9319764327798608, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.358516114828291, "learning_rate": 4.159904534606205e-07, "logits/chosen": 0.7316790819168091, "logits/rejected": 0.4060133099555969, "logps/accuracies": 0.75, "logps/chosen": -300.3427734375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -247.51315307617188, "logps/ref_rejected": -180.51429748535156, "logps/rejected": -308.9923400878906, "loss": 0.1218, "rewards/accuracies": 0.75, "rewards/chosen": -2.6414809226989746, "rewards/grad_term": 0.010488856583833694, "rewards/margins": 3.7824203968048096, "rewards/rejected": -6.423901081085205, "step": 435 }, { "epoch": 0.9341189073379753, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.691996285863658, "learning_rate": 4.151551312649165e-07, "logits/chosen": 0.7785161733627319, "logits/rejected": 0.6878386735916138, "logps/accuracies": 0.75, "logps/chosen": -333.62005615234375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -292.43572998046875, "logps/ref_rejected": -241.63511657714844, "logps/rejected": -372.54913330078125, "loss": 0.1926, "rewards/accuracies": 1.0, "rewards/chosen": -2.059215545654297, "rewards/grad_term": 0.0018129110103473067, "rewards/margins": 4.486485481262207, "rewards/rejected": -6.545701026916504, "step": 436 }, { "epoch": 0.93626138189609, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 6.403058396602235, "learning_rate": 4.1431980906921235e-07, "logits/chosen": 0.6294098496437073, "logits/rejected": 0.4260585904121399, "logps/accuracies": 1.0, "logps/chosen": -306.869384765625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -250.37619018554688, "logps/ref_rejected": -255.60894775390625, "logps/rejected": -436.3756103515625, "loss": 0.1025, "rewards/accuracies": 1.0, "rewards/chosen": -2.824659824371338, "rewards/grad_term": 0.0016512804431840777, "rewards/margins": 6.2136712074279785, "rewards/rejected": -9.038331031799316, "step": 437 }, { "epoch": 0.9384038564542047, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 10.33335056458741, "learning_rate": 4.1348448687350833e-07, "logits/chosen": 0.5839954018592834, "logits/rejected": 0.5881964564323425, "logps/accuracies": 0.75, "logps/chosen": -430.4659118652344, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -370.88543701171875, "logps/ref_rejected": -322.64886474609375, "logps/rejected": -474.2005615234375, "loss": 0.1119, "rewards/accuracies": 1.0, "rewards/chosen": -2.9790241718292236, "rewards/grad_term": 0.003231536131352186, "rewards/margins": 4.598560810089111, "rewards/rejected": -7.577584743499756, "step": 438 }, { "epoch": 0.9405463310123192, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.524534921674278, "learning_rate": 4.126491646778043e-07, "logits/chosen": 0.8089597225189209, "logits/rejected": 0.6896387338638306, "logps/accuracies": 1.0, "logps/chosen": -643.2845458984375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -516.3448486328125, "logps/ref_rejected": -450.1552734375, "logps/rejected": -712.5276489257812, "loss": 0.1106, "rewards/accuracies": 1.0, "rewards/chosen": -6.346982955932617, "rewards/grad_term": 0.0003070410166401416, "rewards/margins": 6.771636486053467, "rewards/rejected": -13.118619918823242, "step": 439 }, { "epoch": 0.9426888055704339, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.35386030961725, "learning_rate": 4.118138424821002e-07, "logits/chosen": 0.4550638198852539, "logits/rejected": 0.11324800550937653, "logps/accuracies": 0.75, "logps/chosen": -256.8120422363281, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -193.059326171875, "logps/ref_rejected": -195.65859985351562, "logps/rejected": -337.16546630859375, "loss": 0.1343, "rewards/accuracies": 1.0, "rewards/chosen": -3.1876347064971924, "rewards/grad_term": 0.007398877292871475, "rewards/margins": 3.8877077102661133, "rewards/rejected": -7.075342655181885, "step": 440 }, { "epoch": 0.9448312801285484, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 11.730894891356659, "learning_rate": 4.1097852028639616e-07, "logits/chosen": 0.7687960267066956, "logits/rejected": 0.6161290407180786, "logps/accuracies": 1.0, "logps/chosen": -429.12823486328125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -342.45672607421875, "logps/ref_rejected": -322.35748291015625, "logps/rejected": -519.225341796875, "loss": 0.1435, "rewards/accuracies": 1.0, "rewards/chosen": -4.3335771560668945, "rewards/grad_term": 0.0011160215362906456, "rewards/margins": 5.5098161697387695, "rewards/rejected": -9.843393325805664, "step": 441 }, { "epoch": 0.9469737546866631, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.1151510120037, "learning_rate": 4.1014319809069213e-07, "logits/chosen": 0.5211978554725647, "logits/rejected": 0.4055239260196686, "logps/accuracies": 0.75, "logps/chosen": -252.66140747070312, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -219.652099609375, "logps/ref_rejected": -156.125244140625, "logps/rejected": -275.7467956542969, "loss": 0.1075, "rewards/accuracies": 1.0, "rewards/chosen": -1.6504652500152588, "rewards/grad_term": 0.005798153579235077, "rewards/margins": 4.330613136291504, "rewards/rejected": -5.981078624725342, "step": 442 }, { "epoch": 0.9491162292447777, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.643922368612464, "learning_rate": 4.0930787589498806e-07, "logits/chosen": 0.7781935930252075, "logits/rejected": 0.3824615180492401, "logps/accuracies": 0.75, "logps/chosen": -381.4145202636719, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -311.0098571777344, "logps/ref_rejected": -217.48388671875, "logps/rejected": -411.19970703125, "loss": 0.0984, "rewards/accuracies": 1.0, "rewards/chosen": -3.520233631134033, "rewards/grad_term": 0.00042740529170259833, "rewards/margins": 6.1655592918396, "rewards/rejected": -9.685792922973633, "step": 443 }, { "epoch": 0.9512587038028923, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.91825980199573, "learning_rate": 4.08472553699284e-07, "logits/chosen": 0.6866825819015503, "logits/rejected": 0.6368831992149353, "logps/accuracies": 0.5, "logps/chosen": -438.6324462890625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -397.3633728027344, "logps/ref_rejected": -329.361572265625, "logps/rejected": -486.6770324707031, "loss": 0.133, "rewards/accuracies": 1.0, "rewards/chosen": -2.06345272064209, "rewards/grad_term": 0.0025374325923621655, "rewards/margins": 5.80232048034668, "rewards/rejected": -7.865772724151611, "step": 444 }, { "epoch": 0.953401178361007, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 5.821241307927092, "learning_rate": 4.076372315035799e-07, "logits/chosen": 0.8077370524406433, "logits/rejected": 0.5723898410797119, "logps/accuracies": 0.75, "logps/chosen": -629.1956787109375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -488.9486389160156, "logps/ref_rejected": -358.0147705078125, "logps/rejected": -686.6176147460938, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": -7.012351036071777, "rewards/grad_term": 8.414402145717759e-06, "rewards/margins": 9.417790412902832, "rewards/rejected": -16.430139541625977, "step": 445 }, { "epoch": 0.9555436529191216, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.65211568963401, "learning_rate": 4.068019093078759e-07, "logits/chosen": 0.7759550213813782, "logits/rejected": 0.7140344977378845, "logps/accuracies": 1.0, "logps/chosen": -505.0233459472656, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -406.8385009765625, "logps/ref_rejected": -350.51312255859375, "logps/rejected": -550.5768432617188, "loss": 0.1625, "rewards/accuracies": 1.0, "rewards/chosen": -4.909243106842041, "rewards/grad_term": 0.0008991943905130029, "rewards/margins": 5.0939459800720215, "rewards/rejected": -10.003189086914062, "step": 446 }, { "epoch": 0.9576861274772362, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.561770484711239, "learning_rate": 4.0596658711217187e-07, "logits/chosen": 0.761772632598877, "logits/rejected": 0.5069707632064819, "logps/accuracies": 0.75, "logps/chosen": -414.2353210449219, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -330.43389892578125, "logps/ref_rejected": -286.1955261230469, "logps/rejected": -513.2139892578125, "loss": 0.1028, "rewards/accuracies": 1.0, "rewards/chosen": -4.190073490142822, "rewards/grad_term": 0.0025082218926399946, "rewards/margins": 7.160850524902344, "rewards/rejected": -11.350923538208008, "step": 447 }, { "epoch": 0.9598286020353508, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 11.50375604154419, "learning_rate": 4.0513126491646774e-07, "logits/chosen": 0.7668182849884033, "logits/rejected": 0.6712717413902283, "logps/accuracies": 0.75, "logps/chosen": -451.71124267578125, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -316.8441467285156, "logps/ref_rejected": -260.1050720214844, "logps/rejected": -519.7904052734375, "loss": 0.1387, "rewards/accuracies": 1.0, "rewards/chosen": -6.743355751037598, "rewards/grad_term": 0.0019724913872778416, "rewards/margins": 6.240912437438965, "rewards/rejected": -12.984268188476562, "step": 448 }, { "epoch": 0.9598286020353508, "eval_flips/correct->correct": 0.14000000059604645, "eval_flips/correct->incorrect": 0.019999999552965164, "eval_flips/incorrect->correct": 0.5799999833106995, "eval_flips/incorrect->incorrect": 0.25999999046325684, "eval_logits/chosen": 0.6544824838638306, "eval_logits/rejected": 0.5577883720397949, "eval_logps/accuracies": 0.7200000286102295, "eval_logps/chosen": -392.440185546875, "eval_logps/ref_accuracies": 0.1599999964237213, "eval_logps/ref_chosen": -323.51568603515625, "eval_logps/ref_rejected": -258.70098876953125, "eval_logps/rejected": -422.9569091796875, "eval_loss": 0.13714653253555298, "eval_rewards/accuracies": 0.9200000166893005, "eval_rewards/chosen": -3.446227788925171, "eval_rewards/grad_term": 0.006565955467522144, "eval_rewards/margins": 4.766568660736084, "eval_rewards/rejected": -8.212796211242676, "eval_runtime": 373.2435, "eval_samples_per_second": 4.233, "eval_steps_per_second": 0.134, "step": 448 }, { "epoch": 0.9619710765934655, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.75, "grad_norm": 7.143313955724084, "learning_rate": 4.042959427207637e-07, "logits/chosen": 0.6099262833595276, "logits/rejected": 0.41160258650779724, "logps/accuracies": 0.25, "logps/chosen": -495.52197265625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -402.29571533203125, "logps/ref_rejected": -238.91366577148438, "logps/rejected": -411.18255615234375, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": -4.661313056945801, "rewards/grad_term": 0.008577575907111168, "rewards/margins": 3.9521327018737793, "rewards/rejected": -8.613445281982422, "step": 449 }, { "epoch": 0.96411355115158, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.099073379071028, "learning_rate": 4.0346062052505964e-07, "logits/chosen": 0.7317103147506714, "logits/rejected": 0.5711554884910583, "logps/accuracies": 0.75, "logps/chosen": -509.8515319824219, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -422.57989501953125, "logps/ref_rejected": -335.5193176269531, "logps/rejected": -622.3995971679688, "loss": 0.1295, "rewards/accuracies": 1.0, "rewards/chosen": -4.363581657409668, "rewards/grad_term": 4.9737202061805874e-05, "rewards/margins": 9.980432510375977, "rewards/rejected": -14.344014167785645, "step": 450 }, { "epoch": 0.9662560257096947, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.501700427807891, "learning_rate": 4.0262529832935557e-07, "logits/chosen": 0.7478474974632263, "logits/rejected": 0.6434618234634399, "logps/accuracies": 1.0, "logps/chosen": -299.7742004394531, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -228.0120849609375, "logps/ref_rejected": -183.03488159179688, "logps/rejected": -342.20123291015625, "loss": 0.1458, "rewards/accuracies": 1.0, "rewards/chosen": -3.588106632232666, "rewards/grad_term": 0.003840662771835923, "rewards/margins": 4.370211124420166, "rewards/rejected": -7.958317279815674, "step": 451 }, { "epoch": 0.9683985002678093, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.391268423681992, "learning_rate": 4.017899761336515e-07, "logits/chosen": 0.4842509627342224, "logits/rejected": 0.3438522517681122, "logps/accuracies": 0.5, "logps/chosen": -378.8311767578125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -306.450439453125, "logps/ref_rejected": -239.88665771484375, "logps/rejected": -436.4774475097656, "loss": 0.1131, "rewards/accuracies": 1.0, "rewards/chosen": -3.61903715133667, "rewards/grad_term": 0.0038894114550203085, "rewards/margins": 6.210503578186035, "rewards/rejected": -9.829540252685547, "step": 452 }, { "epoch": 0.9705409748259239, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 13.121406224741564, "learning_rate": 4.0095465393794747e-07, "logits/chosen": 0.7076852321624756, "logits/rejected": 0.5975925922393799, "logps/accuracies": 0.75, "logps/chosen": -533.9036254882812, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -425.0966796875, "logps/ref_rejected": -362.5286560058594, "logps/rejected": -631.1089477539062, "loss": 0.162, "rewards/accuracies": 1.0, "rewards/chosen": -5.4403462409973145, "rewards/grad_term": 7.804081542417407e-05, "rewards/margins": 7.988667011260986, "rewards/rejected": -13.4290132522583, "step": 453 }, { "epoch": 0.9726834493840386, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 11.264528709557153, "learning_rate": 4.0011933174224345e-07, "logits/chosen": 0.5773014426231384, "logits/rejected": 0.37737879157066345, "logps/accuracies": 0.5, "logps/chosen": -441.2142639160156, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -361.462890625, "logps/ref_rejected": -243.72628784179688, "logps/rejected": -388.4267272949219, "loss": 0.1549, "rewards/accuracies": 1.0, "rewards/chosen": -3.9875683784484863, "rewards/grad_term": 0.0044038868509233, "rewards/margins": 3.2474539279937744, "rewards/rejected": -7.23502254486084, "step": 454 }, { "epoch": 0.9748259239421532, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 17.35451032981818, "learning_rate": 3.992840095465393e-07, "logits/chosen": 0.7966763377189636, "logits/rejected": 0.6966639161109924, "logps/accuracies": 0.75, "logps/chosen": -576.56640625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -470.6474304199219, "logps/ref_rejected": -458.44146728515625, "logps/rejected": -663.11572265625, "loss": 0.2188, "rewards/accuracies": 1.0, "rewards/chosen": -5.295950889587402, "rewards/grad_term": 0.0004940250655636191, "rewards/margins": 4.937762260437012, "rewards/rejected": -10.233713150024414, "step": 455 }, { "epoch": 0.9769683985002678, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.682054456108627, "learning_rate": 3.984486873508353e-07, "logits/chosen": 0.9120834469795227, "logits/rejected": 0.6336569786071777, "logps/accuracies": 1.0, "logps/chosen": -432.57769775390625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -369.40673828125, "logps/ref_rejected": -295.18524169921875, "logps/rejected": -496.9414978027344, "loss": 0.1222, "rewards/accuracies": 1.0, "rewards/chosen": -3.1585474014282227, "rewards/grad_term": 8.77337297424674e-05, "rewards/margins": 6.929265022277832, "rewards/rejected": -10.087812423706055, "step": 456 }, { "epoch": 0.9791108730583824, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.798800443633192, "learning_rate": 3.976133651551313e-07, "logits/chosen": 0.7685093879699707, "logits/rejected": 0.6625791788101196, "logps/accuracies": 0.75, "logps/chosen": -364.49139404296875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -279.5849609375, "logps/ref_rejected": -255.69351196289062, "logps/rejected": -443.4805908203125, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": -4.245321750640869, "rewards/grad_term": 0.007468358147889376, "rewards/margins": 5.144031524658203, "rewards/rejected": -9.389352798461914, "step": 457 }, { "epoch": 0.9812533476164971, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 10.393895759274342, "learning_rate": 3.967780429594272e-07, "logits/chosen": 0.7607497572898865, "logits/rejected": 0.7056368589401245, "logps/accuracies": 1.0, "logps/chosen": -333.776123046875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -276.8760986328125, "logps/ref_rejected": -238.05545043945312, "logps/rejected": -385.234375, "loss": 0.1466, "rewards/accuracies": 0.75, "rewards/chosen": -2.8449995517730713, "rewards/grad_term": 0.010424750857055187, "rewards/margins": 4.513948917388916, "rewards/rejected": -7.358948230743408, "step": 458 }, { "epoch": 0.9833958221746116, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 13.14222406723579, "learning_rate": 3.9594272076372313e-07, "logits/chosen": 0.5317684412002563, "logits/rejected": 0.3798009753227234, "logps/accuracies": 0.5, "logps/chosen": -388.8833923339844, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -284.9061584472656, "logps/ref_rejected": -228.88584899902344, "logps/rejected": -419.9346923828125, "loss": 0.2091, "rewards/accuracies": 1.0, "rewards/chosen": -5.1988606452941895, "rewards/grad_term": 0.006651153787970543, "rewards/margins": 4.35358190536499, "rewards/rejected": -9.55244255065918, "step": 459 }, { "epoch": 0.9855382967327263, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 9.290117062157519, "learning_rate": 3.9510739856801906e-07, "logits/chosen": 0.7666717171669006, "logits/rejected": 0.6038914918899536, "logps/accuracies": 0.75, "logps/chosen": -517.7335205078125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -393.77362060546875, "logps/ref_rejected": -367.8737487792969, "logps/rejected": -612.9761962890625, "loss": 0.1298, "rewards/accuracies": 1.0, "rewards/chosen": -6.197994232177734, "rewards/grad_term": 0.002154412679374218, "rewards/margins": 6.057126045227051, "rewards/rejected": -12.255121231079102, "step": 460 }, { "epoch": 0.987680771290841, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 8.418543548161525, "learning_rate": 3.9427207637231504e-07, "logits/chosen": 0.4025576114654541, "logits/rejected": 0.24721147119998932, "logps/accuracies": 0.75, "logps/chosen": -312.319091796875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -264.7703857421875, "logps/ref_rejected": -213.37884521484375, "logps/rejected": -414.29150390625, "loss": 0.1181, "rewards/accuracies": 1.0, "rewards/chosen": -2.3774335384368896, "rewards/grad_term": 0.002074267016723752, "rewards/margins": 7.668199062347412, "rewards/rejected": -10.045632362365723, "step": 461 }, { "epoch": 0.9898232458489555, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 5.924066455521681, "learning_rate": 3.934367541766109e-07, "logits/chosen": 0.7529336810112, "logits/rejected": 0.5446640253067017, "logps/accuracies": 0.5, "logps/chosen": -391.60406494140625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -341.4349365234375, "logps/ref_rejected": -242.84326171875, "logps/rejected": -389.7578125, "loss": 0.0904, "rewards/accuracies": 0.75, "rewards/chosen": -2.5084564685821533, "rewards/grad_term": 0.0086339320987463, "rewards/margins": 4.837271690368652, "rewards/rejected": -7.345727920532227, "step": 462 }, { "epoch": 0.9919657204070702, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.341264657659469, "learning_rate": 3.926014319809069e-07, "logits/chosen": 0.5941088199615479, "logits/rejected": 0.5042127370834351, "logps/accuracies": 1.0, "logps/chosen": -501.0787353515625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -397.529541015625, "logps/ref_rejected": -341.83441162109375, "logps/rejected": -633.3097534179688, "loss": 0.1559, "rewards/accuracies": 1.0, "rewards/chosen": -5.177460193634033, "rewards/grad_term": 4.0835613617673516e-05, "rewards/margins": 9.396307945251465, "rewards/rejected": -14.573768615722656, "step": 463 }, { "epoch": 0.9941081949651848, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.207024818309272, "learning_rate": 3.9176610978520286e-07, "logits/chosen": 0.3791959285736084, "logits/rejected": 0.4412694573402405, "logps/accuracies": 1.0, "logps/chosen": -288.90289306640625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -218.5885772705078, "logps/ref_rejected": -224.43006896972656, "logps/rejected": -435.8794860839844, "loss": 0.1242, "rewards/accuracies": 1.0, "rewards/chosen": -3.5157151222229004, "rewards/grad_term": 0.0030132310930639505, "rewards/margins": 7.056755065917969, "rewards/rejected": -10.572470664978027, "step": 464 }, { "epoch": 0.9962506695232994, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.607745217245667, "learning_rate": 3.9093078758949884e-07, "logits/chosen": 0.5457050800323486, "logits/rejected": 0.5041043162345886, "logps/accuracies": 1.0, "logps/chosen": -283.53179931640625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -227.90573120117188, "logps/ref_rejected": -220.95458984375, "logps/rejected": -366.6824951171875, "loss": 0.1246, "rewards/accuracies": 1.0, "rewards/chosen": -2.781303882598877, "rewards/grad_term": 0.003597520524635911, "rewards/margins": 4.50508975982666, "rewards/rejected": -7.286393642425537, "step": 465 }, { "epoch": 0.998393144081414, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 7.293270758501063, "learning_rate": 3.900954653937947e-07, "logits/chosen": 0.6813356280326843, "logits/rejected": 0.6149817109107971, "logps/accuracies": 0.75, "logps/chosen": -369.541748046875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -312.1705017089844, "logps/ref_rejected": -293.1113586425781, "logps/rejected": -475.52325439453125, "loss": 0.1107, "rewards/accuracies": 1.0, "rewards/chosen": -2.8685624599456787, "rewards/grad_term": 0.005431050434708595, "rewards/margins": 6.25203275680542, "rewards/rejected": -9.120595932006836, "step": 466 }, { "epoch": 1.0005356186395287, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 7.806449837480457, "learning_rate": 3.892601431980907e-07, "logits/chosen": 0.5919859409332275, "logits/rejected": 0.5162959694862366, "logps/accuracies": 1.0, "logps/chosen": -398.9145812988281, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -307.1631774902344, "logps/ref_rejected": -271.4671630859375, "logps/rejected": -484.1536865234375, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": -4.5875701904296875, "rewards/grad_term": 0.0008761522476561368, "rewards/margins": 6.046757221221924, "rewards/rejected": -10.63432788848877, "step": 467 }, { "epoch": 1.0026780931976433, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 3.522329323680681, "learning_rate": 3.884248210023866e-07, "logits/chosen": 0.6553293466567993, "logits/rejected": 0.5018194913864136, "logps/accuracies": 0.75, "logps/chosen": -324.5142517089844, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -261.5387878417969, "logps/ref_rejected": -222.56907653808594, "logps/rejected": -426.75311279296875, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": -3.1487741470336914, "rewards/grad_term": 0.0020364022348076105, "rewards/margins": 7.060428619384766, "rewards/rejected": -10.20920181274414, "step": 468 }, { "epoch": 1.004820567755758, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.5, "grad_norm": 6.612470098819514, "learning_rate": 3.8758949880668254e-07, "logits/chosen": 0.6158171892166138, "logits/rejected": 0.5050429105758667, "logps/accuracies": 0.5, "logps/chosen": -406.8590087890625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -289.06756591796875, "logps/ref_rejected": -229.218994140625, "logps/rejected": -454.9029846191406, "loss": 0.1037, "rewards/accuracies": 1.0, "rewards/chosen": -5.889570713043213, "rewards/grad_term": 0.0034782271832227707, "rewards/margins": 5.39462947845459, "rewards/rejected": -11.284200668334961, "step": 469 }, { "epoch": 1.0069630423138725, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 3.539220855844499, "learning_rate": 3.8675417661097847e-07, "logits/chosen": 0.7073222994804382, "logits/rejected": 0.671281635761261, "logps/accuracies": 0.75, "logps/chosen": -430.43096923828125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -342.07391357421875, "logps/ref_rejected": -324.9189453125, "logps/rejected": -591.3267211914062, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -4.417852401733398, "rewards/grad_term": 0.00558136124163866, "rewards/margins": 8.902535438537598, "rewards/rejected": -13.320388793945312, "step": 470 }, { "epoch": 1.0091055168719871, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.25, "grad_norm": 6.354661123726489, "learning_rate": 3.8591885441527445e-07, "logits/chosen": 0.7619997262954712, "logits/rejected": 0.706304669380188, "logps/accuracies": 0.75, "logps/chosen": -384.338623046875, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -320.0787658691406, "logps/ref_rejected": -228.5266876220703, "logps/rejected": -378.6219482421875, "loss": 0.0982, "rewards/accuracies": 1.0, "rewards/chosen": -3.212991237640381, "rewards/grad_term": 0.0007301281439140439, "rewards/margins": 4.291770935058594, "rewards/rejected": -7.504762649536133, "step": 471 }, { "epoch": 1.0112479914301018, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 4.790880655224275, "learning_rate": 3.8508353221957043e-07, "logits/chosen": 0.7498035430908203, "logits/rejected": 0.518231987953186, "logps/accuracies": 0.75, "logps/chosen": -450.41552734375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -375.4889221191406, "logps/ref_rejected": -315.0843200683594, "logps/rejected": -485.2691650390625, "loss": 0.0852, "rewards/accuracies": 1.0, "rewards/chosen": -3.7463297843933105, "rewards/grad_term": 0.0010887248208746314, "rewards/margins": 4.762913227081299, "rewards/rejected": -8.50924301147461, "step": 472 }, { "epoch": 1.0133904659882165, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 5.7481678574942885, "learning_rate": 3.842482100238663e-07, "logits/chosen": 0.5814411640167236, "logits/rejected": 0.48274481296539307, "logps/accuracies": 1.0, "logps/chosen": -477.7788391113281, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -356.0843200683594, "logps/ref_rejected": -339.63299560546875, "logps/rejected": -642.8599853515625, "loss": 0.1017, "rewards/accuracies": 1.0, "rewards/chosen": -6.084725856781006, "rewards/grad_term": 1.688380325504113e-05, "rewards/margins": 9.07662296295166, "rewards/rejected": -15.16135025024414, "step": 473 }, { "epoch": 1.015532940546331, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.361744125881987, "learning_rate": 3.834128878281623e-07, "logits/chosen": 0.9029494524002075, "logits/rejected": 0.8422863483428955, "logps/accuracies": 1.0, "logps/chosen": -642.5513916015625, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -501.9047546386719, "logps/ref_rejected": -415.24755859375, "logps/rejected": -698.61962890625, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": -7.0323286056518555, "rewards/grad_term": 0.001934091211296618, "rewards/margins": 7.1362762451171875, "rewards/rejected": -14.16860580444336, "step": 474 }, { "epoch": 1.0176754151044456, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 4.64410207259052, "learning_rate": 3.8257756563245826e-07, "logits/chosen": 0.5650697946548462, "logits/rejected": 0.5173856019973755, "logps/accuracies": 1.0, "logps/chosen": -540.633056640625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -457.04205322265625, "logps/ref_rejected": -435.2861633300781, "logps/rejected": -643.3292846679688, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": -4.1795477867126465, "rewards/grad_term": 0.0007545886328443885, "rewards/margins": 6.2226080894470215, "rewards/rejected": -10.402155876159668, "step": 475 }, { "epoch": 1.0198178896625603, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 4.573143939908089, "learning_rate": 3.817422434367542e-07, "logits/chosen": 0.5865468978881836, "logits/rejected": 0.7002366185188293, "logps/accuracies": 0.75, "logps/chosen": -433.32452392578125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -351.2342529296875, "logps/ref_rejected": -321.6819152832031, "logps/rejected": -513.435791015625, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": -4.104512691497803, "rewards/grad_term": 0.003012165194377303, "rewards/margins": 5.483182430267334, "rewards/rejected": -9.587695121765137, "step": 476 }, { "epoch": 1.021960364220675, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0, "grad_norm": 8.11310764037292, "learning_rate": 3.809069212410501e-07, "logits/chosen": 0.4245867133140564, "logits/rejected": 0.4632042348384857, "logps/accuracies": 1.0, "logps/chosen": -426.9747314453125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -358.86248779296875, "logps/ref_rejected": -292.428466796875, "logps/rejected": -462.8052673339844, "loss": 0.0946, "rewards/accuracies": 1.0, "rewards/chosen": -3.4056124687194824, "rewards/grad_term": 0.001515088020823896, "rewards/margins": 5.113227844238281, "rewards/rejected": -8.518840789794922, "step": 477 }, { "epoch": 1.0241028387787896, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.5, "grad_norm": 4.791373581896772, "learning_rate": 3.8007159904534603e-07, "logits/chosen": 0.5890440940856934, "logits/rejected": 0.36139726638793945, "logps/accuracies": 0.5, "logps/chosen": -490.9143371582031, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -361.0188903808594, "logps/ref_rejected": -289.797119140625, "logps/rejected": -517.291259765625, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": -6.494773864746094, "rewards/grad_term": 0.0016336999833583832, "rewards/margins": 4.879931449890137, "rewards/rejected": -11.37470531463623, "step": 478 }, { "epoch": 1.026245313336904, "flips/correct->correct": 0.0, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 1.0, "flips/incorrect->incorrect": 0.0, "grad_norm": 9.419742623980941, "learning_rate": 3.79236276849642e-07, "logits/chosen": 0.6677709817886353, "logits/rejected": 0.6186258792877197, "logps/accuracies": 1.0, "logps/chosen": -357.1605224609375, "logps/ref_accuracies": 0.0, "logps/ref_chosen": -280.62677001953125, "logps/ref_rejected": -218.89462280273438, "logps/rejected": -433.9444274902344, "loss": 0.0807, "rewards/accuracies": 1.0, "rewards/chosen": -3.826688766479492, "rewards/grad_term": 0.004192608408629894, "rewards/margins": 6.925801753997803, "rewards/rejected": -10.752490043640137, "step": 479 }, { "epoch": 1.0283877878950187, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5, "grad_norm": 5.500558735593387, "learning_rate": 3.784009546539379e-07, "logits/chosen": 0.5895904302597046, "logits/rejected": 0.5445250868797302, "logps/accuracies": 0.5, "logps/chosen": -155.37521362304688, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -109.69457244873047, "logps/ref_rejected": -98.66912078857422, "logps/rejected": -187.35279846191406, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": -2.284031867980957, "rewards/grad_term": 0.0074999695643782616, "rewards/margins": 2.1501517295837402, "rewards/rejected": -4.4341840744018555, "step": 480 } ], "logging_steps": 1, "max_steps": 932, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 96, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }