diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3136 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9982859101816935, + "eval_steps": 0, + "global_step": 182, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005485087418580734, + "grad_norm": 5.038481473898129, + "learning_rate": 5.263157894736843e-07, + "logits/chosen": -0.3854110836982727, + "logits/rejected": -0.38843637704849243, + "logps/chosen": -0.5867404937744141, + "logps/rejected": -0.7349259853363037, + "loss": 0.8549, + "odds_ratio_loss": 8.495767593383789, + "rewards/accuracies": 0.328125, + "rewards/chosen": -0.07349259406328201, + "rewards/margins": -0.014818555675446987, + "rewards/rejected": -0.05867404490709305, + "sft_loss": 0.00530597660690546, + "step": 1 + }, + { + "epoch": 0.010970174837161468, + "grad_norm": 3.8692149987333275, + "learning_rate": 1.0526315789473685e-06, + "logits/chosen": -0.4200110137462616, + "logits/rejected": -0.4337027370929718, + "logps/chosen": -0.5888247489929199, + "logps/rejected": -0.7141146659851074, + "loss": 0.8261, + "odds_ratio_loss": 8.218369483947754, + "rewards/accuracies": 0.3671875, + "rewards/chosen": -0.07141146808862686, + "rewards/margins": -0.01252899132668972, + "rewards/rejected": -0.05888247489929199, + "sft_loss": 0.004305172245949507, + "step": 2 + }, + { + "epoch": 0.0164552622557422, + "grad_norm": 3.4352579088349455, + "learning_rate": 1.5789473684210526e-06, + "logits/chosen": -0.39038556814193726, + "logits/rejected": -0.3625536262989044, + "logps/chosen": -0.6846470236778259, + "logps/rejected": -0.6906213760375977, + "loss": 0.8054, + "odds_ratio_loss": 8.022075653076172, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06906213611364365, + "rewards/margins": -0.0005974352825433016, + "rewards/rejected": -0.06846471130847931, + "sft_loss": 0.0031916582956910133, + "step": 3 + }, + { + "epoch": 0.021940349674322936, + "grad_norm": 6.165427271910815, + "learning_rate": 2.105263157894737e-06, + "logits/chosen": -0.4156845211982727, + "logits/rejected": -0.4445635974407196, + "logps/chosen": -0.6091395020484924, + "logps/rejected": -0.7165651321411133, + "loss": 0.8329, + "odds_ratio_loss": 8.273240089416504, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.07165651768445969, + "rewards/margins": -0.010742560029029846, + "rewards/rejected": -0.06091395765542984, + "sft_loss": 0.005578206852078438, + "step": 4 + }, + { + "epoch": 0.027425437092903668, + "grad_norm": 3.719991962975026, + "learning_rate": 2.631578947368421e-06, + "logits/chosen": -0.4787546396255493, + "logits/rejected": -0.4570210874080658, + "logps/chosen": -0.5717373490333557, + "logps/rejected": -0.6768375039100647, + "loss": 0.7966, + "odds_ratio_loss": 7.933419227600098, + "rewards/accuracies": 0.3359375, + "rewards/chosen": -0.06768374890089035, + "rewards/margins": -0.010510011576116085, + "rewards/rejected": -0.05717373639345169, + "sft_loss": 0.0032929559238255024, + "step": 5 + }, + { + "epoch": 0.0329105245114844, + "grad_norm": 3.623178718641896, + "learning_rate": 3.157894736842105e-06, + "logits/chosen": -0.34492218494415283, + "logits/rejected": -0.42876753211021423, + "logps/chosen": -0.5948079824447632, + "logps/rejected": -0.7103158235549927, + "loss": 0.8222, + "odds_ratio_loss": 8.177949905395508, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0710315853357315, + "rewards/margins": -0.01155078411102295, + "rewards/rejected": -0.05948079749941826, + "sft_loss": 0.004363520070910454, + "step": 6 + }, + { + "epoch": 0.03839561193006513, + "grad_norm": 3.845464498598408, + "learning_rate": 3.6842105263157896e-06, + "logits/chosen": -0.465349018573761, + "logits/rejected": -0.5269383788108826, + "logps/chosen": -0.5850939154624939, + "logps/rejected": -0.6428964734077454, + "loss": 0.7481, + "odds_ratio_loss": 7.433981418609619, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.0642896518111229, + "rewards/margins": -0.00578026007860899, + "rewards/rejected": -0.05850938707590103, + "sft_loss": 0.004695751238614321, + "step": 7 + }, + { + "epoch": 0.04388069934864587, + "grad_norm": 3.534994725436119, + "learning_rate": 4.210526315789474e-06, + "logits/chosen": -0.47682783007621765, + "logits/rejected": -0.5616908669471741, + "logps/chosen": -0.5679696798324585, + "logps/rejected": -0.6700828671455383, + "loss": 0.7817, + "odds_ratio_loss": 7.7718095779418945, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.06700828671455383, + "rewards/margins": -0.010211320593953133, + "rewards/rejected": -0.05679696798324585, + "sft_loss": 0.004522908478975296, + "step": 8 + }, + { + "epoch": 0.049365786767226603, + "grad_norm": 3.0620529937519723, + "learning_rate": 4.736842105263158e-06, + "logits/chosen": -0.5015044808387756, + "logits/rejected": -0.5388280153274536, + "logps/chosen": -0.4832647740840912, + "logps/rejected": -0.7363763451576233, + "loss": 0.8647, + "odds_ratio_loss": 8.59887981414795, + "rewards/accuracies": 0.265625, + "rewards/chosen": -0.07363763451576233, + "rewards/margins": -0.02531115524470806, + "rewards/rejected": -0.04832647740840912, + "sft_loss": 0.004782060626894236, + "step": 9 + }, + { + "epoch": 0.054850874185807336, + "grad_norm": 3.1485369964141117, + "learning_rate": 5.263157894736842e-06, + "logits/chosen": -0.5910385251045227, + "logits/rejected": -0.5793564319610596, + "logps/chosen": -0.5233993530273438, + "logps/rejected": -0.7609111666679382, + "loss": 0.8793, + "odds_ratio_loss": 8.740400314331055, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.07609111815690994, + "rewards/margins": -0.02375117875635624, + "rewards/rejected": -0.052339933812618256, + "sft_loss": 0.00524852704256773, + "step": 10 + }, + { + "epoch": 0.06033596160438807, + "grad_norm": 4.044788224688044, + "learning_rate": 5.789473684210527e-06, + "logits/chosen": -0.63301682472229, + "logits/rejected": -0.6308025121688843, + "logps/chosen": -0.5794235467910767, + "logps/rejected": -0.7120978832244873, + "loss": 0.8159, + "odds_ratio_loss": 8.079412460327148, + "rewards/accuracies": 0.3359375, + "rewards/chosen": -0.07120979577302933, + "rewards/margins": -0.013267431408166885, + "rewards/rejected": -0.057942356914281845, + "sft_loss": 0.007982916198670864, + "step": 11 + }, + { + "epoch": 0.0658210490229688, + "grad_norm": 2.667877748800269, + "learning_rate": 6.31578947368421e-06, + "logits/chosen": -0.6804186701774597, + "logits/rejected": -0.669500470161438, + "logps/chosen": -0.6842315196990967, + "logps/rejected": -0.7194308638572693, + "loss": 0.8272, + "odds_ratio_loss": 8.232641220092773, + "rewards/accuracies": 0.296875, + "rewards/chosen": -0.07194308936595917, + "rewards/margins": -0.0035199355334043503, + "rewards/rejected": -0.06842315196990967, + "sft_loss": 0.0039048779290169477, + "step": 12 + }, + { + "epoch": 0.07130613644154954, + "grad_norm": 3.0798648484182354, + "learning_rate": 6.842105263157896e-06, + "logits/chosen": -0.6367232203483582, + "logits/rejected": -0.6232490539550781, + "logps/chosen": -0.5626485347747803, + "logps/rejected": -0.6783699989318848, + "loss": 0.7821, + "odds_ratio_loss": 7.7734174728393555, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.06783699989318848, + "rewards/margins": -0.011572152376174927, + "rewards/rejected": -0.05626484751701355, + "sft_loss": 0.0047150785103440285, + "step": 13 + }, + { + "epoch": 0.07679122386013026, + "grad_norm": 2.5628005096722, + "learning_rate": 7.368421052631579e-06, + "logits/chosen": -0.6715413928031921, + "logits/rejected": -0.6445600390434265, + "logps/chosen": -0.7547533512115479, + "logps/rejected": -0.7515701055526733, + "loss": 0.8562, + "odds_ratio_loss": 8.51838207244873, + "rewards/accuracies": 0.3515625, + "rewards/chosen": -0.07515701651573181, + "rewards/margins": 0.0003183241933584213, + "rewards/rejected": -0.07547533512115479, + "sft_loss": 0.004331222269684076, + "step": 14 + }, + { + "epoch": 0.082276311278711, + "grad_norm": 3.6153906624666243, + "learning_rate": 7.894736842105265e-06, + "logits/chosen": -0.6177899241447449, + "logits/rejected": -0.5948590040206909, + "logps/chosen": -0.733797550201416, + "logps/rejected": -0.7167279124259949, + "loss": 0.8106, + "odds_ratio_loss": 8.030726432800293, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07167279720306396, + "rewards/margins": 0.0017069588648155332, + "rewards/rejected": -0.0733797550201416, + "sft_loss": 0.007539147045463324, + "step": 15 + }, + { + "epoch": 0.08776139869729174, + "grad_norm": 2.9810395185494576, + "learning_rate": 8.421052631578948e-06, + "logits/chosen": -0.6613759994506836, + "logits/rejected": -0.6372989416122437, + "logps/chosen": -0.6857063174247742, + "logps/rejected": -0.7137706875801086, + "loss": 0.8113, + "odds_ratio_loss": 8.056015968322754, + "rewards/accuracies": 0.359375, + "rewards/chosen": -0.07137707620859146, + "rewards/margins": -0.0028064418584108353, + "rewards/rejected": -0.06857062876224518, + "sft_loss": 0.005733313038945198, + "step": 16 + }, + { + "epoch": 0.09324648611587247, + "grad_norm": 2.6891547946091565, + "learning_rate": 8.947368421052632e-06, + "logits/chosen": -0.6225110292434692, + "logits/rejected": -0.6161482930183411, + "logps/chosen": -0.7698332667350769, + "logps/rejected": -0.7336711287498474, + "loss": 0.8323, + "odds_ratio_loss": 8.280593872070312, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.07336711138486862, + "rewards/margins": 0.0036162168253213167, + "rewards/rejected": -0.07698333263397217, + "sft_loss": 0.004251881968230009, + "step": 17 + }, + { + "epoch": 0.09873157353445321, + "grad_norm": 3.1701971147920314, + "learning_rate": 9.473684210526315e-06, + "logits/chosen": -0.6359448432922363, + "logits/rejected": -0.5702680349349976, + "logps/chosen": -0.77215576171875, + "logps/rejected": -0.7223557829856873, + "loss": 0.8107, + "odds_ratio_loss": 8.051921844482422, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.0722355842590332, + "rewards/margins": 0.004979996010661125, + "rewards/rejected": -0.07721558213233948, + "sft_loss": 0.005473591852933168, + "step": 18 + }, + { + "epoch": 0.10421666095303393, + "grad_norm": 5.701499338439259, + "learning_rate": 1e-05, + "logits/chosen": -0.6352053284645081, + "logits/rejected": -0.6071786880493164, + "logps/chosen": -0.6723276376724243, + "logps/rejected": -0.6906519532203674, + "loss": 0.7743, + "odds_ratio_loss": 7.4815754890441895, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06906520575284958, + "rewards/margins": -0.001832429552450776, + "rewards/rejected": -0.06723276525735855, + "sft_loss": 0.026183124631643295, + "step": 19 + }, + { + "epoch": 0.10970174837161467, + "grad_norm": 2.32222317741005, + "learning_rate": 9.999071352056676e-06, + "logits/chosen": -0.6049666404724121, + "logits/rejected": -0.5781897306442261, + "logps/chosen": -0.5833567976951599, + "logps/rejected": -0.688912034034729, + "loss": 0.7844, + "odds_ratio_loss": 7.809441566467285, + "rewards/accuracies": 0.359375, + "rewards/chosen": -0.06889119744300842, + "rewards/margins": -0.01055552251636982, + "rewards/rejected": -0.05833568051457405, + "sft_loss": 0.0034232870675623417, + "step": 20 + }, + { + "epoch": 0.11518683579019541, + "grad_norm": 2.5009890128022625, + "learning_rate": 9.996285753181499e-06, + "logits/chosen": -0.6923652291297913, + "logits/rejected": -0.622920036315918, + "logps/chosen": -0.5788823366165161, + "logps/rejected": -0.7099403738975525, + "loss": 0.8091, + "odds_ratio_loss": 8.047269821166992, + "rewards/accuracies": 0.3359375, + "rewards/chosen": -0.07099403440952301, + "rewards/margins": -0.013105800375342369, + "rewards/rejected": -0.05788823589682579, + "sft_loss": 0.004356673918664455, + "step": 21 + }, + { + "epoch": 0.12067192320877614, + "grad_norm": 2.504946922805322, + "learning_rate": 9.991644238110741e-06, + "logits/chosen": -0.7856975197792053, + "logits/rejected": -0.6942786574363708, + "logps/chosen": -0.8530550003051758, + "logps/rejected": -0.6810007691383362, + "loss": 0.7769, + "odds_ratio_loss": 7.728884696960449, + "rewards/accuracies": 0.328125, + "rewards/chosen": -0.06810007989406586, + "rewards/margins": 0.017205417156219482, + "rewards/rejected": -0.08530549705028534, + "sft_loss": 0.003965733572840691, + "step": 22 + }, + { + "epoch": 0.12615701062735687, + "grad_norm": 2.3381968971086358, + "learning_rate": 9.985148530977767e-06, + "logits/chosen": -0.7372395992279053, + "logits/rejected": -0.6523022651672363, + "logps/chosen": -0.5776190161705017, + "logps/rejected": -0.6639264822006226, + "loss": 0.7573, + "odds_ratio_loss": 7.541070938110352, + "rewards/accuracies": 0.3203125, + "rewards/chosen": -0.06639264523983002, + "rewards/margins": -0.008630745112895966, + "rewards/rejected": -0.05776190012693405, + "sft_loss": 0.0031744332518428564, + "step": 23 + }, + { + "epoch": 0.1316420980459376, + "grad_norm": 2.210859888036035, + "learning_rate": 9.976801044672608e-06, + "logits/chosen": -0.7757277488708496, + "logits/rejected": -0.6656205654144287, + "logps/chosen": -0.5713773965835571, + "logps/rejected": -0.6431125402450562, + "loss": 0.7376, + "odds_ratio_loss": 7.344100475311279, + "rewards/accuracies": 0.3984375, + "rewards/chosen": -0.06431125849485397, + "rewards/margins": -0.007173515390604734, + "rewards/rejected": -0.057137735188007355, + "sft_loss": 0.0031648115254938602, + "step": 24 + }, + { + "epoch": 0.13712718546451835, + "grad_norm": 2.5797643977876326, + "learning_rate": 9.966604879945659e-06, + "logits/chosen": -0.8338367342948914, + "logits/rejected": -0.7324843406677246, + "logps/chosen": -0.5937628149986267, + "logps/rejected": -0.6450057625770569, + "loss": 0.735, + "odds_ratio_loss": 7.3058085441589355, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.06450057774782181, + "rewards/margins": -0.005124296993017197, + "rewards/rejected": -0.05937628448009491, + "sft_loss": 0.004404762759804726, + "step": 25 + }, + { + "epoch": 0.14261227288309908, + "grad_norm": 2.285207611169885, + "learning_rate": 9.954563824255879e-06, + "logits/chosen": -0.7936639785766602, + "logits/rejected": -0.7150090932846069, + "logps/chosen": -0.5255985856056213, + "logps/rejected": -0.6223771572113037, + "loss": 0.7183, + "odds_ratio_loss": 7.146646499633789, + "rewards/accuracies": 0.4296875, + "rewards/chosen": -0.062237709760665894, + "rewards/margins": -0.009677855297923088, + "rewards/rejected": -0.052559856325387955, + "sft_loss": 0.0036687427200376987, + "step": 26 + }, + { + "epoch": 0.1480973603016798, + "grad_norm": 2.323478414221198, + "learning_rate": 9.940682350363913e-06, + "logits/chosen": -0.7588051557540894, + "logits/rejected": -0.7106639742851257, + "logps/chosen": -0.5572831630706787, + "logps/rejected": -0.6621856689453125, + "loss": 0.7592, + "odds_ratio_loss": 7.553915977478027, + "rewards/accuracies": 0.328125, + "rewards/chosen": -0.06621856987476349, + "rewards/margins": -0.010490254499018192, + "rewards/rejected": -0.05572831630706787, + "sft_loss": 0.0038430241402238607, + "step": 27 + }, + { + "epoch": 0.15358244772026053, + "grad_norm": 2.2864612903822175, + "learning_rate": 9.924965614670629e-06, + "logits/chosen": -0.7962155342102051, + "logits/rejected": -0.7206164598464966, + "logps/chosen": -0.6034549474716187, + "logps/rejected": -0.6673721075057983, + "loss": 0.7587, + "odds_ratio_loss": 7.547135829925537, + "rewards/accuracies": 0.3828125, + "rewards/chosen": -0.06673721969127655, + "rewards/margins": -0.006391720846295357, + "rewards/rejected": -0.060345496982336044, + "sft_loss": 0.00402811961248517, + "step": 28 + }, + { + "epoch": 0.15906753513884128, + "grad_norm": 5.4563944483821025, + "learning_rate": 9.90741945530174e-06, + "logits/chosen": -0.7886516451835632, + "logits/rejected": -0.7214576005935669, + "logps/chosen": -0.6277047991752625, + "logps/rejected": -0.774495542049408, + "loss": 0.873, + "odds_ratio_loss": 8.486030578613281, + "rewards/accuracies": 0.2890625, + "rewards/chosen": -0.07744955271482468, + "rewards/margins": -0.0146790761500597, + "rewards/rejected": -0.06277047842741013, + "sft_loss": 0.02440182864665985, + "step": 29 + }, + { + "epoch": 0.164552622557422, + "grad_norm": 2.5755938714187216, + "learning_rate": 9.888050389939172e-06, + "logits/chosen": -0.8029124736785889, + "logits/rejected": -0.7149651050567627, + "logps/chosen": -0.6227287650108337, + "logps/rejected": -0.7041884660720825, + "loss": 0.796, + "odds_ratio_loss": 7.916073799133301, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.07041884958744049, + "rewards/margins": -0.008145971223711967, + "rewards/rejected": -0.06227288395166397, + "sft_loss": 0.004434916190803051, + "step": 30 + }, + { + "epoch": 0.17003770997600273, + "grad_norm": 2.738052763928764, + "learning_rate": 9.866865613400008e-06, + "logits/chosen": -0.745469331741333, + "logits/rejected": -0.7005943059921265, + "logps/chosen": -0.635623574256897, + "logps/rejected": -0.706161379814148, + "loss": 0.8005, + "odds_ratio_loss": 7.957401275634766, + "rewards/accuracies": 0.3828125, + "rewards/chosen": -0.07061614096164703, + "rewards/margins": -0.007053782232105732, + "rewards/rejected": -0.06356236338615417, + "sft_loss": 0.0047803232446312904, + "step": 31 + }, + { + "epoch": 0.17552279739458349, + "grad_norm": 2.110256032046137, + "learning_rate": 9.843872994963912e-06, + "logits/chosen": -0.8181653618812561, + "logits/rejected": -0.7452071309089661, + "logps/chosen": -0.5545619130134583, + "logps/rejected": -0.6596910357475281, + "loss": 0.754, + "odds_ratio_loss": 7.508852958679199, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.06596910208463669, + "rewards/margins": -0.010512909851968288, + "rewards/rejected": -0.05545618757605553, + "sft_loss": 0.0030794725753366947, + "step": 32 + }, + { + "epoch": 0.1810078848131642, + "grad_norm": 3.010643924076955, + "learning_rate": 9.819081075450014e-06, + "logits/chosen": -0.7885746359825134, + "logits/rejected": -0.7216675877571106, + "logps/chosen": -0.7158695459365845, + "logps/rejected": -0.6520407199859619, + "loss": 0.7394, + "odds_ratio_loss": 7.32637882232666, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.06520406901836395, + "rewards/margins": 0.006382887717336416, + "rewards/rejected": -0.07158695161342621, + "sft_loss": 0.006795875262469053, + "step": 33 + }, + { + "epoch": 0.18649297223174494, + "grad_norm": 2.476605715920486, + "learning_rate": 9.792499064044343e-06, + "logits/chosen": -0.8410877585411072, + "logits/rejected": -0.7374798059463501, + "logps/chosen": -0.6041279435157776, + "logps/rejected": -0.6848863959312439, + "loss": 0.7765, + "odds_ratio_loss": 7.719598770141602, + "rewards/accuracies": 0.3671875, + "rewards/chosen": -0.06848862767219543, + "rewards/margins": -0.008075837977230549, + "rewards/rejected": -0.060412801802158356, + "sft_loss": 0.00449481513351202, + "step": 34 + }, + { + "epoch": 0.1919780596503257, + "grad_norm": 2.5393725876466284, + "learning_rate": 9.764136834878987e-06, + "logits/chosen": -0.8155717253684998, + "logits/rejected": -0.7687007188796997, + "logps/chosen": -0.6000927686691284, + "logps/rejected": -0.6661568880081177, + "loss": 0.7545, + "odds_ratio_loss": 7.500143051147461, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.06661568582057953, + "rewards/margins": -0.006606410723179579, + "rewards/rejected": -0.06000927463173866, + "sft_loss": 0.004530766978859901, + "step": 35 + }, + { + "epoch": 0.19746314706890641, + "grad_norm": 2.5736785638504127, + "learning_rate": 9.734004923364258e-06, + "logits/chosen": -0.7973151803016663, + "logits/rejected": -0.777863085269928, + "logps/chosen": -0.6574283838272095, + "logps/rejected": -0.7205468416213989, + "loss": 0.8155, + "odds_ratio_loss": 8.100136756896973, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0720546767115593, + "rewards/margins": -0.006311837118119001, + "rewards/rejected": -0.06574283540248871, + "sft_loss": 0.0054591940715909, + "step": 36 + }, + { + "epoch": 0.20294823448748714, + "grad_norm": 2.670773258722001, + "learning_rate": 9.702114522275216e-06, + "logits/chosen": -0.7815201878547668, + "logits/rejected": -0.7440929412841797, + "logps/chosen": -0.6075029373168945, + "logps/rejected": -0.7015030384063721, + "loss": 0.7927, + "odds_ratio_loss": 7.880993843078613, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07015030831098557, + "rewards/margins": -0.009400014765560627, + "rewards/rejected": -0.06075029447674751, + "sft_loss": 0.004569103941321373, + "step": 37 + }, + { + "epoch": 0.20843332190606786, + "grad_norm": 4.330660536492607, + "learning_rate": 9.668477477594021e-06, + "logits/chosen": -0.7350383996963501, + "logits/rejected": -0.6684775352478027, + "logps/chosen": -0.6681689023971558, + "logps/rejected": -0.6945660710334778, + "loss": 0.7763, + "odds_ratio_loss": 7.6398186683654785, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06945660710334778, + "rewards/margins": -0.0026397148612886667, + "rewards/rejected": -0.06681689620018005, + "sft_loss": 0.012337452732026577, + "step": 38 + }, + { + "epoch": 0.21391840932464862, + "grad_norm": 2.0952393942157537, + "learning_rate": 9.633106284109612e-06, + "logits/chosen": -0.71222984790802, + "logits/rejected": -0.6697849035263062, + "logps/chosen": -0.5882755517959595, + "logps/rejected": -0.6613237261772156, + "loss": 0.7526, + "odds_ratio_loss": 7.493914604187012, + "rewards/accuracies": 0.3515625, + "rewards/chosen": -0.06613237410783768, + "rewards/margins": -0.007304816506803036, + "rewards/rejected": -0.05882755666971207, + "sft_loss": 0.003193965647369623, + "step": 39 + }, + { + "epoch": 0.21940349674322934, + "grad_norm": 2.5760849251786686, + "learning_rate": 9.596014080776424e-06, + "logits/chosen": -0.7155317068099976, + "logits/rejected": -0.701492428779602, + "logps/chosen": -0.6026738882064819, + "logps/rejected": -0.7281025648117065, + "loss": 0.8228, + "odds_ratio_loss": 8.181192398071289, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.07281024754047394, + "rewards/margins": -0.01254285965114832, + "rewards/rejected": -0.06026739254593849, + "sft_loss": 0.004728448111563921, + "step": 40 + }, + { + "epoch": 0.22488858416181007, + "grad_norm": 3.204343350485489, + "learning_rate": 9.557214645833792e-06, + "logits/chosen": -0.815459668636322, + "logits/rejected": -0.756503701210022, + "logps/chosen": -0.5707991719245911, + "logps/rejected": -0.6623473763465881, + "loss": 0.7528, + "odds_ratio_loss": 7.451156139373779, + "rewards/accuracies": 0.3515625, + "rewards/chosen": -0.06623473763465881, + "rewards/margins": -0.009154818020761013, + "rewards/rejected": -0.057079918682575226, + "sft_loss": 0.0077085429802536964, + "step": 41 + }, + { + "epoch": 0.23037367158039082, + "grad_norm": 2.5773834426506927, + "learning_rate": 9.516722391687903e-06, + "logits/chosen": -0.6963569521903992, + "logits/rejected": -0.6759909391403198, + "logps/chosen": -0.6941713094711304, + "logps/rejected": -0.7426818013191223, + "loss": 0.846, + "odds_ratio_loss": 8.418367385864258, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.07426817715167999, + "rewards/margins": -0.004851049743592739, + "rewards/rejected": -0.06941712647676468, + "sft_loss": 0.004144534468650818, + "step": 42 + }, + { + "epoch": 0.23585875899897155, + "grad_norm": 2.2138593444070183, + "learning_rate": 9.474552359558167e-06, + "logits/chosen": -0.7346150279045105, + "logits/rejected": -0.6656796932220459, + "logps/chosen": -0.6170094609260559, + "logps/rejected": -0.6504194736480713, + "loss": 0.736, + "odds_ratio_loss": 7.326251983642578, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06504195928573608, + "rewards/margins": -0.0033410112373530865, + "rewards/rejected": -0.06170094013214111, + "sft_loss": 0.0034048811066895723, + "step": 43 + }, + { + "epoch": 0.24134384641755227, + "grad_norm": 2.4455061811366283, + "learning_rate": 9.43072021389003e-06, + "logits/chosen": -0.7304015159606934, + "logits/rejected": -0.7045088410377502, + "logps/chosen": -0.6213870644569397, + "logps/rejected": -0.6929490566253662, + "loss": 0.7783, + "odds_ratio_loss": 7.74072265625, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06929488480091095, + "rewards/margins": -0.0071561927907168865, + "rewards/rejected": -0.06213871017098427, + "sft_loss": 0.004237705375999212, + "step": 44 + }, + { + "epoch": 0.24682893383613302, + "grad_norm": 2.2689944416738097, + "learning_rate": 9.385242236536259e-06, + "logits/chosen": -0.7978068590164185, + "logits/rejected": -0.7774004936218262, + "logps/chosen": -0.7401710748672485, + "logps/rejected": -0.6718741655349731, + "loss": 0.7556, + "odds_ratio_loss": 7.514792442321777, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.06718742102384567, + "rewards/margins": 0.006829693913459778, + "rewards/rejected": -0.07401710748672485, + "sft_loss": 0.004071537870913744, + "step": 45 + }, + { + "epoch": 0.25231402125471375, + "grad_norm": 2.2496277356274805, + "learning_rate": 9.338135320708912e-06, + "logits/chosen": -0.7474158406257629, + "logits/rejected": -0.6932293176651001, + "logps/chosen": -0.6186713576316833, + "logps/rejected": -0.6508045196533203, + "loss": 0.7338, + "odds_ratio_loss": 7.302546501159668, + "rewards/accuracies": 0.359375, + "rewards/chosen": -0.06508044898509979, + "rewards/margins": -0.003213312244042754, + "rewards/rejected": -0.06186714768409729, + "sft_loss": 0.003513358999043703, + "step": 46 + }, + { + "epoch": 0.2577991086732945, + "grad_norm": 2.229413899291416, + "learning_rate": 9.289416964704186e-06, + "logits/chosen": -0.7597657442092896, + "logits/rejected": -0.7157298922538757, + "logps/chosen": -0.5537360906600952, + "logps/rejected": -0.6344780921936035, + "loss": 0.7248, + "odds_ratio_loss": 7.21159029006958, + "rewards/accuracies": 0.3359375, + "rewards/chosen": -0.06344781070947647, + "rewards/margins": -0.008074202574789524, + "rewards/rejected": -0.05537361279129982, + "sft_loss": 0.0036083455197513103, + "step": 47 + }, + { + "epoch": 0.2632841960918752, + "grad_norm": 1.8803176092666092, + "learning_rate": 9.239105265402525e-06, + "logits/chosen": -0.727577269077301, + "logits/rejected": -0.70409095287323, + "logps/chosen": -0.5894535779953003, + "logps/rejected": -0.6098184585571289, + "loss": 0.692, + "odds_ratio_loss": 6.8936448097229, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.06098184362053871, + "rewards/margins": -0.002036482561379671, + "rewards/rejected": -0.058945365250110626, + "sft_loss": 0.0026457542553544044, + "step": 48 + }, + { + "epoch": 0.2687692835104559, + "grad_norm": 2.3520901683998137, + "learning_rate": 9.187218911546363e-06, + "logits/chosen": -0.7061797976493835, + "logits/rejected": -0.6750433444976807, + "logps/chosen": -0.6185547709465027, + "logps/rejected": -0.6763335466384888, + "loss": 0.7632, + "odds_ratio_loss": 7.58837890625, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.06763336062431335, + "rewards/margins": -0.005777883809059858, + "rewards/rejected": -0.06185547634959221, + "sft_loss": 0.004371694289147854, + "step": 49 + }, + { + "epoch": 0.2742543709290367, + "grad_norm": 2.3579251994795682, + "learning_rate": 9.133777176798013e-06, + "logits/chosen": -0.6374361515045166, + "logits/rejected": -0.6806906461715698, + "logps/chosen": -0.6513757705688477, + "logps/rejected": -0.6520736813545227, + "loss": 0.7371, + "odds_ratio_loss": 7.324197292327881, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.06520736962556839, + "rewards/margins": -6.9792615249753e-05, + "rewards/rejected": -0.065137580037117, + "sft_loss": 0.004632241558283567, + "step": 50 + }, + { + "epoch": 0.27973945834761743, + "grad_norm": 2.2191064607544746, + "learning_rate": 9.078799912580305e-06, + "logits/chosen": -0.6874204277992249, + "logits/rejected": -0.6352756023406982, + "logps/chosen": -0.6136583685874939, + "logps/rejected": -0.5992326736450195, + "loss": 0.6819, + "odds_ratio_loss": 6.7846455574035645, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.059923265129327774, + "rewards/margins": 0.0014425672125071287, + "rewards/rejected": -0.06136583536863327, + "sft_loss": 0.003475453471764922, + "step": 51 + }, + { + "epoch": 0.28522454576619816, + "grad_norm": 2.3179034735535966, + "learning_rate": 9.022307540702576e-06, + "logits/chosen": -0.7218315005302429, + "logits/rejected": -0.6824548840522766, + "logps/chosen": -0.6579533815383911, + "logps/rejected": -0.688935399055481, + "loss": 0.7754, + "odds_ratio_loss": 7.713038444519043, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.06889353692531586, + "rewards/margins": -0.0030981944873929024, + "rewards/rejected": -0.06579533964395523, + "sft_loss": 0.004128783475607634, + "step": 52 + }, + { + "epoch": 0.2907096331847789, + "grad_norm": 2.422506021035563, + "learning_rate": 8.964321045774808e-06, + "logits/chosen": -0.6635921597480774, + "logits/rejected": -0.6691629886627197, + "logps/chosen": -0.6667566895484924, + "logps/rejected": -0.6598675847053528, + "loss": 0.7415, + "odds_ratio_loss": 7.367385387420654, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.0659867525100708, + "rewards/margins": 0.0006889166543260217, + "rewards/rejected": -0.06667566299438477, + "sft_loss": 0.004781263880431652, + "step": 53 + }, + { + "epoch": 0.2961947206033596, + "grad_norm": 2.3663856409710604, + "learning_rate": 8.904861967412702e-06, + "logits/chosen": -0.729520857334137, + "logits/rejected": -0.721049427986145, + "logps/chosen": -0.6047177314758301, + "logps/rejected": -0.6680710315704346, + "loss": 0.7525, + "odds_ratio_loss": 7.480541706085205, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.0668071061372757, + "rewards/margins": -0.006335328333079815, + "rewards/rejected": -0.060471780598163605, + "sft_loss": 0.004433046095073223, + "step": 54 + }, + { + "epoch": 0.30167980802194033, + "grad_norm": 5.907064022641608, + "learning_rate": 8.843952392236595e-06, + "logits/chosen": -0.7405853867530823, + "logits/rejected": -0.6213741302490234, + "logps/chosen": -0.6097532510757446, + "logps/rejected": -0.7615051865577698, + "loss": 0.8548, + "odds_ratio_loss": 7.123908042907715, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.07615052163600922, + "rewards/margins": -0.015175196342170238, + "rewards/rejected": -0.060975320637226105, + "sft_loss": 0.1424168199300766, + "step": 55 + }, + { + "epoch": 0.30716489544052106, + "grad_norm": 2.3801144562859733, + "learning_rate": 8.78161494566717e-06, + "logits/chosen": -0.7353604435920715, + "logits/rejected": -0.6483398675918579, + "logps/chosen": -0.628172755241394, + "logps/rejected": -0.6272903084754944, + "loss": 0.7044, + "odds_ratio_loss": 7.003999710083008, + "rewards/accuracies": 0.421875, + "rewards/chosen": -0.06272903829813004, + "rewards/margins": 8.82376916706562e-05, + "rewards/rejected": -0.0628172755241394, + "sft_loss": 0.004041117150336504, + "step": 56 + }, + { + "epoch": 0.31264998285910184, + "grad_norm": 2.0180451466126073, + "learning_rate": 8.717872783521048e-06, + "logits/chosen": -0.7183849215507507, + "logits/rejected": -0.6541566848754883, + "logps/chosen": -0.7877294421195984, + "logps/rejected": -0.6478090882301331, + "loss": 0.7258, + "odds_ratio_loss": 7.223598480224609, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.06478091329336166, + "rewards/margins": 0.01399203110486269, + "rewards/rejected": -0.07877294719219208, + "sft_loss": 0.0034678103402256966, + "step": 57 + }, + { + "epoch": 0.31813507027768256, + "grad_norm": 2.2432673044491187, + "learning_rate": 8.65274958340934e-06, + "logits/chosen": -0.7320058345794678, + "logits/rejected": -0.6547486186027527, + "logps/chosen": -0.6831153631210327, + "logps/rejected": -0.6445227265357971, + "loss": 0.7223, + "odds_ratio_loss": 7.180177688598633, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.06445227563381195, + "rewards/margins": 0.0038592619821429253, + "rewards/rejected": -0.06831153482198715, + "sft_loss": 0.004313957877457142, + "step": 58 + }, + { + "epoch": 0.3236201576962633, + "grad_norm": 2.253712626261103, + "learning_rate": 8.586269535942386e-06, + "logits/chosen": -0.7257508039474487, + "logits/rejected": -0.6928269267082214, + "logps/chosen": -0.6556897759437561, + "logps/rejected": -0.6581557393074036, + "loss": 0.7387, + "odds_ratio_loss": 7.349256992340088, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.06581557542085648, + "rewards/margins": -0.00024658930487930775, + "rewards/rejected": -0.06556899100542068, + "sft_loss": 0.003765275003388524, + "step": 59 + }, + { + "epoch": 0.329105245114844, + "grad_norm": 2.2071752316866995, + "learning_rate": 8.518457335743927e-06, + "logits/chosen": -0.679678201675415, + "logits/rejected": -0.664280891418457, + "logps/chosen": -0.7143180966377258, + "logps/rejected": -0.6450636982917786, + "loss": 0.7221, + "odds_ratio_loss": 7.182507514953613, + "rewards/accuracies": 0.4609375, + "rewards/chosen": -0.06450636684894562, + "rewards/margins": 0.0069254375994205475, + "rewards/rejected": -0.07143180072307587, + "sft_loss": 0.0038413461297750473, + "step": 60 + }, + { + "epoch": 0.33459033253342474, + "grad_norm": 2.592752168909039, + "learning_rate": 8.44933817227806e-06, + "logits/chosen": -0.7278729677200317, + "logits/rejected": -0.6981578469276428, + "logps/chosen": -0.7912625074386597, + "logps/rejected": -0.7018301486968994, + "loss": 0.7851, + "odds_ratio_loss": 7.783860206604004, + "rewards/accuracies": 0.3828125, + "rewards/chosen": -0.07018300890922546, + "rewards/margins": 0.008943240158259869, + "rewards/rejected": -0.0791262537240982, + "sft_loss": 0.0066674333065748215, + "step": 61 + }, + { + "epoch": 0.34007541995200546, + "grad_norm": 5.454393119042178, + "learning_rate": 8.378937720492384e-06, + "logits/chosen": -0.7215204238891602, + "logits/rejected": -0.6841076016426086, + "logps/chosen": -0.7616227865219116, + "logps/rejected": -0.6432775259017944, + "loss": 0.7209, + "odds_ratio_loss": 7.020868301391602, + "rewards/accuracies": 0.4609375, + "rewards/chosen": -0.06432775408029556, + "rewards/margins": 0.0118345245718956, + "rewards/rejected": -0.07616227120161057, + "sft_loss": 0.01877610757946968, + "step": 62 + }, + { + "epoch": 0.34556050737058625, + "grad_norm": 2.0687414102448085, + "learning_rate": 8.307282131280805e-06, + "logits/chosen": -0.7056500315666199, + "logits/rejected": -0.6676835417747498, + "logps/chosen": -0.6483266353607178, + "logps/rejected": -0.626646876335144, + "loss": 0.7043, + "odds_ratio_loss": 7.007047176361084, + "rewards/accuracies": 0.3984375, + "rewards/chosen": -0.062664695084095, + "rewards/margins": 0.0021679732017219067, + "rewards/rejected": -0.0648326650261879, + "sft_loss": 0.003634248161688447, + "step": 63 + }, + { + "epoch": 0.35104559478916697, + "grad_norm": 2.3475998338820365, + "learning_rate": 8.234398021769541e-06, + "logits/chosen": -0.743563175201416, + "logits/rejected": -0.7020170092582703, + "logps/chosen": -0.7933846712112427, + "logps/rejected": -0.6398557424545288, + "loss": 0.718, + "odds_ratio_loss": 7.136807441711426, + "rewards/accuracies": 0.4609375, + "rewards/chosen": -0.06398558616638184, + "rewards/margins": 0.015352879650890827, + "rewards/rejected": -0.07933846116065979, + "sft_loss": 0.004295012913644314, + "step": 64 + }, + { + "epoch": 0.3565306822077477, + "grad_norm": 2.240900663083349, + "learning_rate": 8.160312465429952e-06, + "logits/chosen": -0.7589367628097534, + "logits/rejected": -0.6660048365592957, + "logps/chosen": -0.8276284337043762, + "logps/rejected": -0.6442070603370667, + "loss": 0.7207, + "odds_ratio_loss": 7.165534973144531, + "rewards/accuracies": 0.4609375, + "rewards/chosen": -0.06442070752382278, + "rewards/margins": 0.018342135474085808, + "rewards/rejected": -0.08276285231113434, + "sft_loss": 0.004189603962004185, + "step": 65 + }, + { + "epoch": 0.3620157696263284, + "grad_norm": 2.2431116846423977, + "learning_rate": 8.085052982021849e-06, + "logits/chosen": -0.7091330289840698, + "logits/rejected": -0.6447920799255371, + "logps/chosen": -0.6476763486862183, + "logps/rejected": -0.6205025315284729, + "loss": 0.7019, + "odds_ratio_loss": 6.978389739990234, + "rewards/accuracies": 0.3984375, + "rewards/chosen": -0.06205025315284729, + "rewards/margins": 0.0027173825073987246, + "rewards/rejected": -0.06476763635873795, + "sft_loss": 0.004105943720787764, + "step": 66 + }, + { + "epoch": 0.36750085704490915, + "grad_norm": 2.4189727594993613, + "learning_rate": 8.008647527371022e-06, + "logits/chosen": -0.7610785961151123, + "logits/rejected": -0.7081894874572754, + "logps/chosen": -0.8307157754898071, + "logps/rejected": -0.6853421330451965, + "loss": 0.7664, + "odds_ratio_loss": 7.616702079772949, + "rewards/accuracies": 0.4609375, + "rewards/chosen": -0.06853421032428741, + "rewards/margins": 0.014537353999912739, + "rewards/rejected": -0.08307155966758728, + "sft_loss": 0.004689488559961319, + "step": 67 + }, + { + "epoch": 0.37298594446348987, + "grad_norm": 2.1080406296134604, + "learning_rate": 7.931124482984802e-06, + "logits/chosen": -0.7000724077224731, + "logits/rejected": -0.648707926273346, + "logps/chosen": -0.7306436896324158, + "logps/rejected": -0.6893948912620544, + "loss": 0.7721, + "odds_ratio_loss": 7.686524391174316, + "rewards/accuracies": 0.359375, + "rewards/chosen": -0.06893948465585709, + "rewards/margins": 0.004124888684600592, + "rewards/rejected": -0.07306437194347382, + "sft_loss": 0.0034583115484565496, + "step": 68 + }, + { + "epoch": 0.3784710318820706, + "grad_norm": 2.279048108347312, + "learning_rate": 7.85251264550948e-06, + "logits/chosen": -0.7173571586608887, + "logits/rejected": -0.6854091882705688, + "logps/chosen": -0.8085934519767761, + "logps/rejected": -0.6753490567207336, + "loss": 0.7495, + "odds_ratio_loss": 7.4520111083984375, + "rewards/accuracies": 0.4609375, + "rewards/chosen": -0.067534901201725, + "rewards/margins": 0.013324443250894547, + "rewards/rejected": -0.08085935562849045, + "sft_loss": 0.004327339120209217, + "step": 69 + }, + { + "epoch": 0.3839561193006514, + "grad_norm": 2.2929493653439623, + "learning_rate": 7.772841216033534e-06, + "logits/chosen": -0.7415894865989685, + "logits/rejected": -0.7029559016227722, + "logps/chosen": -0.7581257224082947, + "logps/rejected": -0.6466966867446899, + "loss": 0.7216, + "odds_ratio_loss": 7.17643404006958, + "rewards/accuracies": 0.4609375, + "rewards/chosen": -0.06466967612504959, + "rewards/margins": 0.011142895556986332, + "rewards/rejected": -0.07581256330013275, + "sft_loss": 0.003946883603930473, + "step": 70 + }, + { + "epoch": 0.3894412067192321, + "grad_norm": 2.4115483219556104, + "learning_rate": 7.692139789240611e-06, + "logits/chosen": -0.7181853652000427, + "logits/rejected": -0.7305399775505066, + "logps/chosen": -0.5790694952011108, + "logps/rejected": -0.6291408538818359, + "loss": 0.7136, + "odds_ratio_loss": 7.071855068206787, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06291408091783524, + "rewards/margins": -0.005007130093872547, + "rewards/rejected": -0.057906948029994965, + "sft_loss": 0.00638266047462821, + "step": 71 + }, + { + "epoch": 0.39492629413781283, + "grad_norm": 2.585528390215209, + "learning_rate": 7.61043834241632e-06, + "logits/chosen": -0.7213830947875977, + "logits/rejected": -0.6931593418121338, + "logps/chosen": -0.6970454454421997, + "logps/rejected": -0.6629546880722046, + "loss": 0.7414, + "odds_ratio_loss": 7.3666510581970215, + "rewards/accuracies": 0.421875, + "rewards/chosen": -0.06629546731710434, + "rewards/margins": 0.003409074852243066, + "rewards/rejected": -0.06970454007387161, + "sft_loss": 0.004775386769324541, + "step": 72 + }, + { + "epoch": 0.40041138155639355, + "grad_norm": 2.5206082590938657, + "learning_rate": 7.527767224312883e-06, + "logits/chosen": -0.6767313480377197, + "logits/rejected": -0.6348490118980408, + "logps/chosen": -0.7730945348739624, + "logps/rejected": -0.6511149406433105, + "loss": 0.7309, + "odds_ratio_loss": 7.25616455078125, + "rewards/accuracies": 0.4296875, + "rewards/chosen": -0.06511148810386658, + "rewards/margins": 0.012197963893413544, + "rewards/rejected": -0.07730945199728012, + "sft_loss": 0.005264699459075928, + "step": 73 + }, + { + "epoch": 0.4058964689749743, + "grad_norm": 2.342735729722048, + "learning_rate": 7.44415714387582e-06, + "logits/chosen": -0.7068739533424377, + "logits/rejected": -0.6443241834640503, + "logps/chosen": -0.6506850123405457, + "logps/rejected": -0.660474419593811, + "loss": 0.7423, + "odds_ratio_loss": 7.371833801269531, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.06604744493961334, + "rewards/margins": -0.0009789422620087862, + "rewards/rejected": -0.06506850570440292, + "sft_loss": 0.005163977388292551, + "step": 74 + }, + { + "epoch": 0.411381556393555, + "grad_norm": 2.244946483469262, + "learning_rate": 7.359639158836828e-06, + "logits/chosen": -0.6814281940460205, + "logits/rejected": -0.6734283566474915, + "logps/chosen": -0.7911446690559387, + "logps/rejected": -0.6680716276168823, + "loss": 0.7437, + "odds_ratio_loss": 7.399670124053955, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06680716574192047, + "rewards/margins": 0.012307303957641125, + "rewards/rejected": -0.07911446690559387, + "sft_loss": 0.0037362282164394855, + "step": 75 + }, + { + "epoch": 0.41686664381213573, + "grad_norm": 1.9776378551597844, + "learning_rate": 7.2742446641770985e-06, + "logits/chosen": -0.6895711421966553, + "logits/rejected": -0.6461458802223206, + "logps/chosen": -0.7058255076408386, + "logps/rejected": -0.6571383476257324, + "loss": 0.7396, + "odds_ratio_loss": 7.361542224884033, + "rewards/accuracies": 0.3359375, + "rewards/chosen": -0.06571383774280548, + "rewards/margins": 0.004868712741881609, + "rewards/rejected": -0.0705825537443161, + "sft_loss": 0.0034039851743727922, + "step": 76 + }, + { + "epoch": 0.4223517312307165, + "grad_norm": 2.5525880372678342, + "learning_rate": 7.188005380465365e-06, + "logits/chosen": -0.672836184501648, + "logits/rejected": -0.6136391162872314, + "logps/chosen": -0.7278799414634705, + "logps/rejected": -0.6120200753211975, + "loss": 0.6869, + "odds_ratio_loss": 6.825428485870361, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.061201997101306915, + "rewards/margins": 0.011585984379053116, + "rewards/rejected": -0.07278798520565033, + "sft_loss": 0.004372562747448683, + "step": 77 + }, + { + "epoch": 0.42783681864929723, + "grad_norm": 2.2583507815548707, + "learning_rate": 7.10095334207501e-06, + "logits/chosen": -0.6909606456756592, + "logits/rejected": -0.6990565061569214, + "logps/chosen": -0.8577082753181458, + "logps/rejected": -0.6745601296424866, + "loss": 0.7559, + "odds_ratio_loss": 7.520651340484619, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.06745601445436478, + "rewards/margins": 0.018314823508262634, + "rewards/rejected": -0.08577083051204681, + "sft_loss": 0.0038547185249626637, + "step": 78 + }, + { + "epoch": 0.43332190606787796, + "grad_norm": 2.1348804607525684, + "learning_rate": 7.013120885284599e-06, + "logits/chosen": -0.8005620241165161, + "logits/rejected": -0.745074987411499, + "logps/chosen": -0.7438629865646362, + "logps/rejected": -0.6111059784889221, + "loss": 0.6881, + "odds_ratio_loss": 6.849056720733643, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.06111060082912445, + "rewards/margins": 0.013275692239403725, + "rewards/rejected": -0.07438629865646362, + "sft_loss": 0.0032314579002559185, + "step": 79 + }, + { + "epoch": 0.4388069934864587, + "grad_norm": 2.2851255790088536, + "learning_rate": 6.924540636266272e-06, + "logits/chosen": -0.8250284194946289, + "logits/rejected": -0.7828257083892822, + "logps/chosen": -0.6604968905448914, + "logps/rejected": -0.6587929129600525, + "loss": 0.7421, + "odds_ratio_loss": 7.378921985626221, + "rewards/accuracies": 0.3515625, + "rewards/chosen": -0.06587929278612137, + "rewards/margins": 0.0001703993184491992, + "rewards/rejected": -0.06604968756437302, + "sft_loss": 0.004175678826868534, + "step": 80 + }, + { + "epoch": 0.4442920809050394, + "grad_norm": 2.2595985001445165, + "learning_rate": 6.835245498966461e-06, + "logits/chosen": -0.7141305208206177, + "logits/rejected": -0.6828845739364624, + "logps/chosen": -0.8394588828086853, + "logps/rejected": -0.6575326919555664, + "loss": 0.7341, + "odds_ratio_loss": 7.295228958129883, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.0657532662153244, + "rewards/margins": 0.018192624673247337, + "rewards/rejected": -0.08394590020179749, + "sft_loss": 0.004541095811873674, + "step": 81 + }, + { + "epoch": 0.44977716832362014, + "grad_norm": 2.2335414228759576, + "learning_rate": 6.7452686428834045e-06, + "logits/chosen": -0.7194411754608154, + "logits/rejected": -0.6863387227058411, + "logps/chosen": -0.8470317721366882, + "logps/rejected": -0.6532705426216125, + "loss": 0.73, + "odds_ratio_loss": 7.253733158111572, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.06532706320285797, + "rewards/margins": 0.019376114010810852, + "rewards/rejected": -0.08470318466424942, + "sft_loss": 0.00458897277712822, + "step": 82 + }, + { + "epoch": 0.4552622557422009, + "grad_norm": 2.054108081993998, + "learning_rate": 6.654643490746042e-06, + "logits/chosen": -0.7527655959129333, + "logits/rejected": -0.7424032092094421, + "logps/chosen": -0.676724374294281, + "logps/rejected": -0.6338033080101013, + "loss": 0.7136, + "odds_ratio_loss": 7.100609302520752, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06338033080101013, + "rewards/margins": 0.004292105324566364, + "rewards/rejected": -0.06767243146896362, + "sft_loss": 0.003586169332265854, + "step": 83 + }, + { + "epoch": 0.46074734316078164, + "grad_norm": 2.0252693365171623, + "learning_rate": 6.563403706098833e-06, + "logits/chosen": -0.7776511907577515, + "logits/rejected": -0.7408439517021179, + "logps/chosen": -0.7689855098724365, + "logps/rejected": -0.6419221758842468, + "loss": 0.7196, + "odds_ratio_loss": 7.161562919616699, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.06419222801923752, + "rewards/margins": 0.012706330046057701, + "rewards/rejected": -0.07689855247735977, + "sft_loss": 0.0034365493338555098, + "step": 84 + }, + { + "epoch": 0.46623243057936237, + "grad_norm": 2.405711155989815, + "learning_rate": 6.471583180797121e-06, + "logits/chosen": -0.7478362321853638, + "logits/rejected": -0.7417604327201843, + "logps/chosen": -0.8892000317573547, + "logps/rejected": -0.6463872790336609, + "loss": 0.726, + "odds_ratio_loss": 7.2149338722229, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.06463872641324997, + "rewards/margins": 0.024281270802021027, + "rewards/rejected": -0.088919997215271, + "sft_loss": 0.004515405744314194, + "step": 85 + }, + { + "epoch": 0.4717175179979431, + "grad_norm": 2.327436937255859, + "learning_rate": 6.379216022417695e-06, + "logits/chosen": -0.713384211063385, + "logits/rejected": -0.7074405550956726, + "logps/chosen": -0.8071316480636597, + "logps/rejected": -0.6722535490989685, + "loss": 0.7491, + "odds_ratio_loss": 7.439596652984619, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.06722535192966461, + "rewards/margins": 0.013487813994288445, + "rewards/rejected": -0.0807131677865982, + "sft_loss": 0.005136145744472742, + "step": 86 + }, + { + "epoch": 0.4772026054165238, + "grad_norm": 4.597101480060567, + "learning_rate": 6.286336541589224e-06, + "logits/chosen": -0.8244719505310059, + "logits/rejected": -0.793204128742218, + "logps/chosen": -0.9169348478317261, + "logps/rejected": -0.6403506398200989, + "loss": 0.7141, + "odds_ratio_loss": 6.974415302276611, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.06403505802154541, + "rewards/margins": 0.027658430859446526, + "rewards/rejected": -0.09169349074363708, + "sft_loss": 0.0166785828769207, + "step": 87 + }, + { + "epoch": 0.48268769283510454, + "grad_norm": 2.98534441586508, + "learning_rate": 6.192979239247243e-06, + "logits/chosen": -0.7573586106300354, + "logits/rejected": -0.7331765294075012, + "logps/chosen": -0.7313320636749268, + "logps/rejected": -0.6538473963737488, + "loss": 0.7336, + "odds_ratio_loss": 7.25970983505249, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.06538473814725876, + "rewards/margins": 0.007748466916382313, + "rewards/rejected": -0.0731332078576088, + "sft_loss": 0.007589638698846102, + "step": 88 + }, + { + "epoch": 0.48817278025368527, + "grad_norm": 3.060607573744663, + "learning_rate": 6.099178793818479e-06, + "logits/chosen": -0.8188400268554688, + "logits/rejected": -0.7003345489501953, + "logps/chosen": -0.7988659143447876, + "logps/rejected": -0.6332603096961975, + "loss": 0.7111, + "odds_ratio_loss": 7.054601192474365, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.06332603842020035, + "rewards/margins": 0.01656055450439453, + "rewards/rejected": -0.07988659292459488, + "sft_loss": 0.005684853531420231, + "step": 89 + }, + { + "epoch": 0.49365786767226605, + "grad_norm": 3.426360021441316, + "learning_rate": 6.0049700483392256e-06, + "logits/chosen": -0.7684093713760376, + "logits/rejected": -0.7432892322540283, + "logps/chosen": -0.6437249183654785, + "logps/rejected": -0.6601754426956177, + "loss": 0.7411, + "odds_ratio_loss": 7.306463241577148, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.06601753830909729, + "rewards/margins": -0.0016450518742203712, + "rewards/rejected": -0.06437249481678009, + "sft_loss": 0.01042965892702341, + "step": 90 + }, + { + "epoch": 0.4991429550908468, + "grad_norm": 2.462338863296407, + "learning_rate": 5.910387997512573e-06, + "logits/chosen": -0.7122192978858948, + "logits/rejected": -0.7019898295402527, + "logps/chosen": -0.7473130822181702, + "logps/rejected": -0.68088299036026, + "loss": 0.7631, + "odds_ratio_loss": 7.561764240264893, + "rewards/accuracies": 0.3984375, + "rewards/chosen": -0.06808829307556152, + "rewards/margins": 0.006643014028668404, + "rewards/rejected": -0.07473130524158478, + "sft_loss": 0.0068762716837227345, + "step": 91 + }, + { + "epoch": 0.5046280425094275, + "grad_norm": 2.0658511683882184, + "learning_rate": 5.815467774709314e-06, + "logits/chosen": -0.8243841528892517, + "logits/rejected": -0.7515370845794678, + "logps/chosen": -0.8632618188858032, + "logps/rejected": -0.6042035222053528, + "loss": 0.6749, + "odds_ratio_loss": 6.714731693267822, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.06042035296559334, + "rewards/margins": 0.025905832648277283, + "rewards/rejected": -0.08632618188858032, + "sft_loss": 0.0033876379020512104, + "step": 92 + }, + { + "epoch": 0.5101131299280083, + "grad_norm": 2.387632371765397, + "learning_rate": 5.7202446389173225e-06, + "logits/chosen": -0.7121865749359131, + "logits/rejected": -0.6870692372322083, + "logps/chosen": -0.8353071212768555, + "logps/rejected": -0.6251768469810486, + "loss": 0.6945, + "odds_ratio_loss": 6.89461612701416, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.0625176802277565, + "rewards/margins": 0.021013032644987106, + "rewards/rejected": -0.0835307165980339, + "sft_loss": 0.0050766305066645145, + "step": 93 + }, + { + "epoch": 0.515598217346589, + "grad_norm": 2.4212503643030865, + "learning_rate": 5.624753961644281e-06, + "logits/chosen": -0.7463563084602356, + "logits/rejected": -0.7374889850616455, + "logps/chosen": -0.6804830431938171, + "logps/rejected": -0.6773659586906433, + "loss": 0.7635, + "odds_ratio_loss": 7.582502841949463, + "rewards/accuracies": 0.3359375, + "rewards/chosen": -0.06773659586906433, + "rewards/margins": 0.00031171052251011133, + "rewards/rejected": -0.06804830580949783, + "sft_loss": 0.005270948633551598, + "step": 94 + }, + { + "epoch": 0.5210833047651697, + "grad_norm": 2.1125220078129914, + "learning_rate": 5.529031213778615e-06, + "logits/chosen": -0.7246003150939941, + "logits/rejected": -0.7458670139312744, + "logps/chosen": -0.6863371133804321, + "logps/rejected": -0.642282247543335, + "loss": 0.7268, + "odds_ratio_loss": 7.228869438171387, + "rewards/accuracies": 0.3984375, + "rewards/chosen": -0.06422823667526245, + "rewards/margins": 0.0044054812751710415, + "rewards/rejected": -0.06863371282815933, + "sft_loss": 0.0038752383552491665, + "step": 95 + }, + { + "epoch": 0.5265683921837504, + "grad_norm": 1.9888533547489935, + "learning_rate": 5.433111952413496e-06, + "logits/chosen": -0.8252724409103394, + "logits/rejected": -0.7656226754188538, + "logps/chosen": -0.698495090007782, + "logps/rejected": -0.6140046715736389, + "loss": 0.6895, + "odds_ratio_loss": 6.8552350997924805, + "rewards/accuracies": 0.5234375, + "rewards/chosen": -0.06140046939253807, + "rewards/margins": 0.008449044078588486, + "rewards/rejected": -0.06984951347112656, + "sft_loss": 0.003965131007134914, + "step": 96 + }, + { + "epoch": 0.5320534796023312, + "grad_norm": 2.5827652079178165, + "learning_rate": 5.3370318076388405e-06, + "logits/chosen": -0.7367005348205566, + "logits/rejected": -0.7038895487785339, + "logps/chosen": -0.8228276968002319, + "logps/rejected": -0.6827890872955322, + "loss": 0.7607, + "odds_ratio_loss": 7.550937175750732, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.06827891618013382, + "rewards/margins": 0.014003856107592583, + "rewards/rejected": -0.08228276669979095, + "sft_loss": 0.0056315576657652855, + "step": 97 + }, + { + "epoch": 0.5375385670209119, + "grad_norm": 2.0893113762601585, + "learning_rate": 5.240826469306187e-06, + "logits/chosen": -0.8139528632164001, + "logits/rejected": -0.7607035636901855, + "logps/chosen": -0.889180064201355, + "logps/rejected": -0.6422348618507385, + "loss": 0.7188, + "odds_ratio_loss": 7.150365829467773, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.06422348320484161, + "rewards/margins": 0.0246945321559906, + "rewards/rejected": -0.08891801536083221, + "sft_loss": 0.003753484459593892, + "step": 98 + }, + { + "epoch": 0.5430236544394926, + "grad_norm": 2.5109300003185537, + "learning_rate": 5.144531673771364e-06, + "logits/chosen": -0.8394501209259033, + "logits/rejected": -0.7594123482704163, + "logps/chosen": -1.1132104396820068, + "logps/rejected": -0.678527295589447, + "loss": 0.7596, + "odds_ratio_loss": 7.536545753479004, + "rewards/accuracies": 0.3984375, + "rewards/chosen": -0.06785273551940918, + "rewards/margins": 0.04346831887960434, + "rewards/rejected": -0.11132105439901352, + "sft_loss": 0.005897625349462032, + "step": 99 + }, + { + "epoch": 0.5485087418580734, + "grad_norm": 2.363027336814075, + "learning_rate": 5.048183190619904e-06, + "logits/chosen": -0.776432454586029, + "logits/rejected": -0.7214481830596924, + "logps/chosen": -0.8866347074508667, + "logps/rejected": -0.6367273330688477, + "loss": 0.7085, + "odds_ratio_loss": 7.037672996520996, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06367272883653641, + "rewards/margins": 0.024990737438201904, + "rewards/rejected": -0.08866347372531891, + "sft_loss": 0.004776695277541876, + "step": 100 + }, + { + "epoch": 0.5539938292766541, + "grad_norm": 1.935622563669197, + "learning_rate": 4.951816809380098e-06, + "logits/chosen": -0.7725133895874023, + "logits/rejected": -0.7367417812347412, + "logps/chosen": -0.7350077629089355, + "logps/rejected": -0.5911646485328674, + "loss": 0.6656, + "odds_ratio_loss": 6.6231279373168945, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.059116464108228683, + "rewards/margins": 0.014384310692548752, + "rewards/rejected": -0.07350076735019684, + "sft_loss": 0.003327850252389908, + "step": 101 + }, + { + "epoch": 0.5594789166952349, + "grad_norm": 2.2992228974600817, + "learning_rate": 4.855468326228638e-06, + "logits/chosen": -0.8631143569946289, + "logits/rejected": -0.8134572505950928, + "logps/chosen": -0.7315545082092285, + "logps/rejected": -0.6632150411605835, + "loss": 0.7458, + "odds_ratio_loss": 7.4111127853393555, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.06632150709629059, + "rewards/margins": 0.006833940744400024, + "rewards/rejected": -0.07315544784069061, + "sft_loss": 0.004706574138253927, + "step": 102 + }, + { + "epoch": 0.5649640041138155, + "grad_norm": 2.0889788103776255, + "learning_rate": 4.7591735306938144e-06, + "logits/chosen": -0.790824294090271, + "logits/rejected": -0.7726211547851562, + "logps/chosen": -0.6989539861679077, + "logps/rejected": -0.6077806949615479, + "loss": 0.6848, + "odds_ratio_loss": 6.813911437988281, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.06077807396650314, + "rewards/margins": 0.00911732017993927, + "rewards/rejected": -0.06989538669586182, + "sft_loss": 0.0033795726485550404, + "step": 103 + }, + { + "epoch": 0.5704490915323963, + "grad_norm": 2.0996097283080295, + "learning_rate": 4.662968192361161e-06, + "logits/chosen": -0.8079996705055237, + "logits/rejected": -0.7932396531105042, + "logps/chosen": -0.7709510326385498, + "logps/rejected": -0.6129697561264038, + "loss": 0.6869, + "odds_ratio_loss": 6.833165645599365, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.0612969771027565, + "rewards/margins": 0.015798134729266167, + "rewards/rejected": -0.07709510624408722, + "sft_loss": 0.0036084074527025223, + "step": 104 + }, + { + "epoch": 0.575934178950977, + "grad_norm": 2.392066071877973, + "learning_rate": 4.5668880475865074e-06, + "logits/chosen": -0.8237022161483765, + "logits/rejected": -0.7995460629463196, + "logps/chosen": -0.6559365391731262, + "logps/rejected": -0.5953893065452576, + "loss": 0.6716, + "odds_ratio_loss": 6.677926063537598, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.05953892320394516, + "rewards/margins": 0.006054719910025597, + "rewards/rejected": -0.0655936449766159, + "sft_loss": 0.003847929183393717, + "step": 105 + }, + { + "epoch": 0.5814192663695578, + "grad_norm": 2.2088045238777636, + "learning_rate": 4.4709687862213866e-06, + "logits/chosen": -0.7528365254402161, + "logits/rejected": -0.6947425603866577, + "logps/chosen": -0.9411884546279907, + "logps/rejected": -0.6525102853775024, + "loss": 0.7281, + "odds_ratio_loss": 7.2357940673828125, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.06525103747844696, + "rewards/margins": 0.028867818415164948, + "rewards/rejected": -0.0941188633441925, + "sft_loss": 0.0045068394392728806, + "step": 106 + }, + { + "epoch": 0.5869043537881385, + "grad_norm": 3.1924453094544942, + "learning_rate": 4.3752460383557195e-06, + "logits/chosen": -0.8217104077339172, + "logits/rejected": -0.7772889733314514, + "logps/chosen": -0.7546268701553345, + "logps/rejected": -0.6607457399368286, + "loss": 0.7412, + "odds_ratio_loss": 7.280969619750977, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.06607457995414734, + "rewards/margins": 0.009388094767928123, + "rewards/rejected": -0.07546267658472061, + "sft_loss": 0.013106751255691051, + "step": 107 + }, + { + "epoch": 0.5923894412067192, + "grad_norm": 2.246014553224009, + "learning_rate": 4.27975536108268e-06, + "logits/chosen": -0.7625182867050171, + "logits/rejected": -0.7424900531768799, + "logps/chosen": -0.7097453474998474, + "logps/rejected": -0.5825679302215576, + "loss": 0.6546, + "odds_ratio_loss": 6.507444381713867, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.05825679004192352, + "rewards/margins": 0.012717743404209614, + "rewards/rejected": -0.07097454369068146, + "sft_loss": 0.003896431066095829, + "step": 108 + }, + { + "epoch": 0.5978745286253, + "grad_norm": 2.107300230600946, + "learning_rate": 4.184532225290687e-06, + "logits/chosen": -0.7371281385421753, + "logits/rejected": -0.7014196515083313, + "logps/chosen": -0.9409867525100708, + "logps/rejected": -0.6328235268592834, + "loss": 0.7067, + "odds_ratio_loss": 7.03104305267334, + "rewards/accuracies": 0.4921875, + "rewards/chosen": -0.06328234821557999, + "rewards/margins": 0.030816327780485153, + "rewards/rejected": -0.09409867972135544, + "sft_loss": 0.003552825888618827, + "step": 109 + }, + { + "epoch": 0.6033596160438807, + "grad_norm": 2.32750784917777, + "learning_rate": 4.089612002487428e-06, + "logits/chosen": -0.7617780566215515, + "logits/rejected": -0.6975070834159851, + "logps/chosen": -0.8203774690628052, + "logps/rejected": -0.616916298866272, + "loss": 0.6908, + "odds_ratio_loss": 6.856823921203613, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.061691634356975555, + "rewards/margins": 0.020346110686659813, + "rewards/rejected": -0.08203774690628052, + "sft_loss": 0.005122015718370676, + "step": 110 + }, + { + "epoch": 0.6088447034624614, + "grad_norm": 3.687888221085956, + "learning_rate": 3.995029951660777e-06, + "logits/chosen": -0.7304787039756775, + "logits/rejected": -0.6660119295120239, + "logps/chosen": -0.8241205215454102, + "logps/rejected": -0.684959888458252, + "loss": 0.7648, + "odds_ratio_loss": 7.607235908508301, + "rewards/accuracies": 0.3984375, + "rewards/chosen": -0.0684959888458252, + "rewards/margins": 0.01391606591641903, + "rewards/rejected": -0.08241204917430878, + "sft_loss": 0.004041440784931183, + "step": 111 + }, + { + "epoch": 0.6143297908810421, + "grad_norm": 3.815048531172436, + "learning_rate": 3.900821206181521e-06, + "logits/chosen": -0.7075001001358032, + "logits/rejected": -0.7053488492965698, + "logps/chosen": -0.7641057372093201, + "logps/rejected": -0.6342787146568298, + "loss": 0.7087, + "odds_ratio_loss": 7.047255516052246, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.0634278729557991, + "rewards/margins": 0.012982700951397419, + "rewards/rejected": -0.07641058415174484, + "sft_loss": 0.0039539458230137825, + "step": 112 + }, + { + "epoch": 0.6198148782996229, + "grad_norm": 2.6437128083822654, + "learning_rate": 3.8070207607527587e-06, + "logits/chosen": -0.751364529132843, + "logits/rejected": -0.7497612237930298, + "logps/chosen": -0.707420289516449, + "logps/rejected": -0.6882792711257935, + "loss": 0.7676, + "odds_ratio_loss": 7.612734794616699, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.06882793456315994, + "rewards/margins": 0.001914096181280911, + "rewards/rejected": -0.07074202597141266, + "sft_loss": 0.0063278162851929665, + "step": 113 + }, + { + "epoch": 0.6252999657182037, + "grad_norm": 2.249087144528014, + "learning_rate": 3.7136634584107787e-06, + "logits/chosen": -0.7711942791938782, + "logits/rejected": -0.7532557249069214, + "logps/chosen": -0.6726529598236084, + "logps/rejected": -0.6081224679946899, + "loss": 0.6863, + "odds_ratio_loss": 6.813708305358887, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.060812242329120636, + "rewards/margins": 0.0064530507661402225, + "rewards/rejected": -0.06726529449224472, + "sft_loss": 0.004954230505973101, + "step": 114 + }, + { + "epoch": 0.6307850531367843, + "grad_norm": 2.235141918994794, + "learning_rate": 3.620783977582305e-06, + "logits/chosen": -0.7548055648803711, + "logits/rejected": -0.7235039472579956, + "logps/chosen": -0.7916484475135803, + "logps/rejected": -0.6184461116790771, + "loss": 0.6943, + "odds_ratio_loss": 6.894530773162842, + "rewards/accuracies": 0.421875, + "rewards/chosen": -0.061844613403081894, + "rewards/margins": 0.017320234328508377, + "rewards/rejected": -0.07916485518217087, + "sft_loss": 0.004836176987737417, + "step": 115 + }, + { + "epoch": 0.6362701405553651, + "grad_norm": 2.1938466749292114, + "learning_rate": 3.528416819202881e-06, + "logits/chosen": -0.7192294597625732, + "logits/rejected": -0.695332944393158, + "logps/chosen": -0.7696835994720459, + "logps/rejected": -0.6214407086372375, + "loss": 0.7016, + "odds_ratio_loss": 6.972049236297607, + "rewards/accuracies": 0.390625, + "rewards/chosen": -0.06214407458901405, + "rewards/margins": 0.014824289828538895, + "rewards/rejected": -0.07696835696697235, + "sft_loss": 0.004408481530845165, + "step": 116 + }, + { + "epoch": 0.6417552279739458, + "grad_norm": 3.6541286655490524, + "learning_rate": 3.43659629390117e-06, + "logits/chosen": -0.7324870824813843, + "logits/rejected": -0.688734769821167, + "logps/chosen": -0.7166416645050049, + "logps/rejected": -0.6708381175994873, + "loss": 0.7539, + "odds_ratio_loss": 7.414391040802002, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.06708382070064545, + "rewards/margins": 0.004580352455377579, + "rewards/rejected": -0.07166417688131332, + "sft_loss": 0.01244704145938158, + "step": 117 + }, + { + "epoch": 0.6472403153925266, + "grad_norm": 2.2808659065754777, + "learning_rate": 3.3453565092539586e-06, + "logits/chosen": -0.7768323421478271, + "logits/rejected": -0.7245185971260071, + "logps/chosen": -0.7158573269844055, + "logps/rejected": -0.616123616695404, + "loss": 0.694, + "odds_ratio_loss": 6.893760681152344, + "rewards/accuracies": 0.4296875, + "rewards/chosen": -0.06161235645413399, + "rewards/margins": 0.009973364882171154, + "rewards/rejected": -0.07158571481704712, + "sft_loss": 0.0046532335691154, + "step": 118 + }, + { + "epoch": 0.6527254028111072, + "grad_norm": 2.0368022051271844, + "learning_rate": 3.254731357116597e-06, + "logits/chosen": -0.7800723910331726, + "logits/rejected": -0.725759744644165, + "logps/chosen": -0.7847654819488525, + "logps/rejected": -0.6137481331825256, + "loss": 0.6878, + "odds_ratio_loss": 6.837745189666748, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06137481704354286, + "rewards/margins": 0.017101740464568138, + "rewards/rejected": -0.07847655564546585, + "sft_loss": 0.003991706296801567, + "step": 119 + }, + { + "epoch": 0.658210490229688, + "grad_norm": 1.9900402701993793, + "learning_rate": 3.16475450103354e-06, + "logits/chosen": -0.7510137557983398, + "logits/rejected": -0.7143795490264893, + "logps/chosen": -0.6110856533050537, + "logps/rejected": -0.5404046773910522, + "loss": 0.614, + "odds_ratio_loss": 6.107670783996582, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05404047295451164, + "rewards/margins": 0.007068089675158262, + "rewards/rejected": -0.061108559370040894, + "sft_loss": 0.003193741664290428, + "step": 120 + }, + { + "epoch": 0.6636955776482688, + "grad_norm": 1.9532423296861858, + "learning_rate": 3.0754593637337276e-06, + "logits/chosen": -0.7384210824966431, + "logits/rejected": -0.7613973021507263, + "logps/chosen": -0.5634211897850037, + "logps/rejected": -0.6090326309204102, + "loss": 0.6938, + "odds_ratio_loss": 6.90548038482666, + "rewards/accuracies": 0.3828125, + "rewards/chosen": -0.06090325862169266, + "rewards/margins": -0.004561137408018112, + "rewards/rejected": -0.056342121213674545, + "sft_loss": 0.003293172223493457, + "step": 121 + }, + { + "epoch": 0.6691806650668495, + "grad_norm": 2.314587585640712, + "learning_rate": 2.986879114715403e-06, + "logits/chosen": -0.7613508701324463, + "logits/rejected": -0.7241955995559692, + "logps/chosen": -0.6129331588745117, + "logps/rejected": -0.6036895513534546, + "loss": 0.6794, + "odds_ratio_loss": 6.749566555023193, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.06036894768476486, + "rewards/margins": 0.0009243742097169161, + "rewards/rejected": -0.06129331886768341, + "sft_loss": 0.004476083908230066, + "step": 122 + }, + { + "epoch": 0.6746657524854303, + "grad_norm": 1.9868042893647633, + "learning_rate": 2.899046657924992e-06, + "logits/chosen": -0.721281886100769, + "logits/rejected": -0.6971719264984131, + "logps/chosen": -0.6944752931594849, + "logps/rejected": -0.609091579914093, + "loss": 0.6844, + "odds_ratio_loss": 6.81266975402832, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06090915948152542, + "rewards/margins": 0.008538373745977879, + "rewards/rejected": -0.06944753229618073, + "sft_loss": 0.003112279111519456, + "step": 123 + }, + { + "epoch": 0.6801508399040109, + "grad_norm": 2.489835822157132, + "learning_rate": 2.8119946195346375e-06, + "logits/chosen": -0.8069707155227661, + "logits/rejected": -0.7617440223693848, + "logps/chosen": -0.7581583857536316, + "logps/rejected": -0.6261847019195557, + "loss": 0.6998, + "odds_ratio_loss": 6.9503560066223145, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.06261847168207169, + "rewards/margins": 0.013197370804846287, + "rewards/rejected": -0.0758158341050148, + "sft_loss": 0.004782428033649921, + "step": 124 + }, + { + "epoch": 0.6856359273225917, + "grad_norm": 2.1133326129430134, + "learning_rate": 2.725755335822903e-06, + "logits/chosen": -0.7975918054580688, + "logits/rejected": -0.7674388885498047, + "logps/chosen": -0.9080885648727417, + "logps/rejected": -0.6341651678085327, + "loss": 0.7058, + "odds_ratio_loss": 7.018553256988525, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.0634165108203888, + "rewards/margins": 0.027392340824007988, + "rewards/rejected": -0.09080885350704193, + "sft_loss": 0.003976397681981325, + "step": 125 + }, + { + "epoch": 0.6911210147411725, + "grad_norm": 2.858149017181916, + "learning_rate": 2.6403608411631744e-06, + "logits/chosen": -0.7425932288169861, + "logits/rejected": -0.7346398234367371, + "logps/chosen": -0.7898511290550232, + "logps/rejected": -0.6491058468818665, + "loss": 0.735, + "odds_ratio_loss": 7.295699119567871, + "rewards/accuracies": 0.3671875, + "rewards/chosen": -0.06491059064865112, + "rewards/margins": 0.014074533246457577, + "rewards/rejected": -0.07898511737585068, + "sft_loss": 0.005448752082884312, + "step": 126 + }, + { + "epoch": 0.6966061021597532, + "grad_norm": 2.150148574063965, + "learning_rate": 2.555842856124182e-06, + "logits/chosen": -0.7330852150917053, + "logits/rejected": -0.6947073340415955, + "logps/chosen": -0.7349799871444702, + "logps/rejected": -0.6045773029327393, + "loss": 0.678, + "odds_ratio_loss": 6.737186908721924, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.060457728803157806, + "rewards/margins": 0.013040272518992424, + "rewards/rejected": -0.07349800318479538, + "sft_loss": 0.004280973691493273, + "step": 127 + }, + { + "epoch": 0.7020911895783339, + "grad_norm": 2.0636310647720912, + "learning_rate": 2.472232775687119e-06, + "logits/chosen": -0.7301775217056274, + "logits/rejected": -0.6499804258346558, + "logps/chosen": -0.7413673400878906, + "logps/rejected": -0.6203609108924866, + "loss": 0.6958, + "odds_ratio_loss": 6.925926208496094, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06203608959913254, + "rewards/margins": 0.012100642547011375, + "rewards/rejected": -0.07413673400878906, + "sft_loss": 0.0032169800251722336, + "step": 128 + }, + { + "epoch": 0.7075762769969146, + "grad_norm": 2.458823451021782, + "learning_rate": 2.389561657583681e-06, + "logits/chosen": -0.738516092300415, + "logits/rejected": -0.6830987334251404, + "logps/chosen": -0.9752374887466431, + "logps/rejected": -0.6493728756904602, + "loss": 0.7323, + "odds_ratio_loss": 7.261032581329346, + "rewards/accuracies": 0.3828125, + "rewards/chosen": -0.0649372935295105, + "rewards/margins": 0.03258645534515381, + "rewards/rejected": -0.09752374142408371, + "sft_loss": 0.006236766930669546, + "step": 129 + }, + { + "epoch": 0.7130613644154954, + "grad_norm": 2.4560062596870877, + "learning_rate": 2.30786021075939e-06, + "logits/chosen": -0.6550124883651733, + "logits/rejected": -0.7046679258346558, + "logps/chosen": -0.6320035457611084, + "logps/rejected": -0.6258726119995117, + "loss": 0.7064, + "odds_ratio_loss": 7.010786533355713, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.06258726119995117, + "rewards/margins": 0.0006130932597443461, + "rewards/rejected": -0.06320035457611084, + "sft_loss": 0.005346982274204493, + "step": 130 + }, + { + "epoch": 0.7185464518340761, + "grad_norm": 2.5029419648282083, + "learning_rate": 2.2271587839664673e-06, + "logits/chosen": -0.688187837600708, + "logits/rejected": -0.6578387022018433, + "logps/chosen": -0.8063658475875854, + "logps/rejected": -0.6298286318778992, + "loss": 0.7054, + "odds_ratio_loss": 6.999924182891846, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.06298286467790604, + "rewards/margins": 0.01765371486544609, + "rewards/rejected": -0.08063658326864243, + "sft_loss": 0.005409896373748779, + "step": 131 + }, + { + "epoch": 0.7240315392526568, + "grad_norm": 3.1547535885726976, + "learning_rate": 2.1474873544905204e-06, + "logits/chosen": -0.6956002712249756, + "logits/rejected": -0.6422963738441467, + "logps/chosen": -0.7219193577766418, + "logps/rejected": -0.6385369300842285, + "loss": 0.7147, + "odds_ratio_loss": 7.056730270385742, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.06385369598865509, + "rewards/margins": 0.008338242769241333, + "rewards/rejected": -0.07219193875789642, + "sft_loss": 0.008979414589703083, + "step": 132 + }, + { + "epoch": 0.7295166266712376, + "grad_norm": 2.0510077565138185, + "learning_rate": 2.0688755170152e-06, + "logits/chosen": -0.6617864370346069, + "logits/rejected": -0.6148036122322083, + "logps/chosen": -0.6251884698867798, + "logps/rejected": -0.5985109210014343, + "loss": 0.6755, + "odds_ratio_loss": 6.721668243408203, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.05985109508037567, + "rewards/margins": 0.0026677525602281094, + "rewards/rejected": -0.06251884996891022, + "sft_loss": 0.003360216738656163, + "step": 133 + }, + { + "epoch": 0.7350017140898183, + "grad_norm": 2.2969756511886286, + "learning_rate": 1.9913524726289784e-06, + "logits/chosen": -0.6497606039047241, + "logits/rejected": -0.6395284533500671, + "logps/chosen": -0.7028053402900696, + "logps/rejected": -0.5877602100372314, + "loss": 0.6632, + "odds_ratio_loss": 6.588471412658691, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.058776021003723145, + "rewards/margins": 0.011504517868161201, + "rewards/rejected": -0.0702805295586586, + "sft_loss": 0.004390079528093338, + "step": 134 + }, + { + "epoch": 0.7404868015083991, + "grad_norm": 2.213983061915376, + "learning_rate": 1.914947017978153e-06, + "logits/chosen": -0.6579415202140808, + "logits/rejected": -0.6358848214149475, + "logps/chosen": -0.7505279183387756, + "logps/rejected": -0.6064987182617188, + "loss": 0.6827, + "odds_ratio_loss": 6.784019470214844, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.06064987555146217, + "rewards/margins": 0.014402923174202442, + "rewards/rejected": -0.07505279779434204, + "sft_loss": 0.004277239087969065, + "step": 135 + }, + { + "epoch": 0.7459718889269797, + "grad_norm": 2.2270439093795993, + "learning_rate": 1.8396875345700498e-06, + "logits/chosen": -0.7001329064369202, + "logits/rejected": -0.6527349948883057, + "logps/chosen": -0.744738757610321, + "logps/rejected": -0.6307080388069153, + "loss": 0.7068, + "odds_ratio_loss": 7.029452323913574, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.06307081133127213, + "rewards/margins": 0.011403076350688934, + "rewards/rejected": -0.07447388023138046, + "sft_loss": 0.003887307131662965, + "step": 136 + }, + { + "epoch": 0.7514569763455605, + "grad_norm": 2.256827786206518, + "learning_rate": 1.7656019782304602e-06, + "logits/chosen": -0.7055901288986206, + "logits/rejected": -0.6784175038337708, + "logps/chosen": -0.6831467151641846, + "logps/rejected": -0.5970955491065979, + "loss": 0.675, + "odds_ratio_loss": 6.704926490783691, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05970955267548561, + "rewards/margins": 0.008605118840932846, + "rewards/rejected": -0.06831467151641846, + "sft_loss": 0.004465795587748289, + "step": 137 + }, + { + "epoch": 0.7569420637641412, + "grad_norm": 2.1013122838813394, + "learning_rate": 1.6927178687191953e-06, + "logits/chosen": -0.7317599058151245, + "logits/rejected": -0.7018448114395142, + "logps/chosen": -0.6241502165794373, + "logps/rejected": -0.5972384214401245, + "loss": 0.6753, + "odds_ratio_loss": 6.7175421714782715, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.05972383916378021, + "rewards/margins": 0.00269118370488286, + "rewards/rejected": -0.062415026128292084, + "sft_loss": 0.0035422425717115402, + "step": 138 + }, + { + "epoch": 0.762427151182722, + "grad_norm": 2.042262448509973, + "learning_rate": 1.621062279507617e-06, + "logits/chosen": -0.7065898180007935, + "logits/rejected": -0.6461969614028931, + "logps/chosen": -0.7283148765563965, + "logps/rejected": -0.594868004322052, + "loss": 0.6671, + "odds_ratio_loss": 6.63283634185791, + "rewards/accuracies": 0.4921875, + "rewards/chosen": -0.05948680266737938, + "rewards/margins": 0.01334468089044094, + "rewards/rejected": -0.07283148169517517, + "sft_loss": 0.0038246996700763702, + "step": 139 + }, + { + "epoch": 0.7679122386013028, + "grad_norm": 2.1324897242067022, + "learning_rate": 1.550661827721941e-06, + "logits/chosen": -0.7135088443756104, + "logits/rejected": -0.6793374419212341, + "logps/chosen": -0.811589241027832, + "logps/rejected": -0.6043570041656494, + "loss": 0.6734, + "odds_ratio_loss": 6.6996307373046875, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.0604357086122036, + "rewards/margins": 0.020723219960927963, + "rewards/rejected": -0.08115892857313156, + "sft_loss": 0.0034452907275408506, + "step": 140 + }, + { + "epoch": 0.7733973260198834, + "grad_norm": 1.8928274566112053, + "learning_rate": 1.4815426642560753e-06, + "logits/chosen": -0.7378033399581909, + "logits/rejected": -0.6366161108016968, + "logps/chosen": -0.766107976436615, + "logps/rejected": -0.573859453201294, + "loss": 0.6475, + "odds_ratio_loss": 6.4447150230407715, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.057385943830013275, + "rewards/margins": 0.01922486163675785, + "rewards/rejected": -0.07661080360412598, + "sft_loss": 0.0030438436660915613, + "step": 141 + }, + { + "epoch": 0.7788824134384642, + "grad_norm": 1.9538303248376696, + "learning_rate": 1.4137304640576161e-06, + "logits/chosen": -0.662543535232544, + "logits/rejected": -0.619470477104187, + "logps/chosen": -0.6948879361152649, + "logps/rejected": -0.6252713799476624, + "loss": 0.704, + "odds_ratio_loss": 7.005824089050293, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.062527135014534, + "rewards/margins": 0.006961653009057045, + "rewards/rejected": -0.06948879361152649, + "sft_loss": 0.003440008033066988, + "step": 142 + }, + { + "epoch": 0.7843675008570449, + "grad_norm": 1.8890336186417962, + "learning_rate": 1.3472504165906614e-06, + "logits/chosen": -0.6532924771308899, + "logits/rejected": -0.6276803016662598, + "logps/chosen": -0.6293469071388245, + "logps/rejected": -0.589890718460083, + "loss": 0.6721, + "odds_ratio_loss": 6.69063138961792, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.05898907035589218, + "rewards/margins": 0.003945623058825731, + "rewards/rejected": -0.06293469667434692, + "sft_loss": 0.0030280579812824726, + "step": 143 + }, + { + "epoch": 0.7898525882756257, + "grad_norm": 1.8750003224288405, + "learning_rate": 1.2821272164789544e-06, + "logits/chosen": -0.7127547264099121, + "logits/rejected": -0.6667823791503906, + "logps/chosen": -0.813082754611969, + "logps/rejected": -0.6018574237823486, + "loss": 0.6788, + "odds_ratio_loss": 6.758955478668213, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.0601857453584671, + "rewards/margins": 0.021122539415955544, + "rewards/rejected": -0.0813082829117775, + "sft_loss": 0.0029439451172947884, + "step": 144 + }, + { + "epoch": 0.7953376756942063, + "grad_norm": 2.1968606793011936, + "learning_rate": 1.2183850543328313e-06, + "logits/chosen": -0.708900511264801, + "logits/rejected": -0.689564049243927, + "logps/chosen": -0.673328697681427, + "logps/rejected": -0.5980782508850098, + "loss": 0.6739, + "odds_ratio_loss": 6.683194160461426, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.059807829558849335, + "rewards/margins": 0.007525048218667507, + "rewards/rejected": -0.06733287870883942, + "sft_loss": 0.005550979636609554, + "step": 145 + }, + { + "epoch": 0.8008227631127871, + "grad_norm": 2.3780917622435966, + "learning_rate": 1.156047607763407e-06, + "logits/chosen": -0.7008949518203735, + "logits/rejected": -0.6693176031112671, + "logps/chosen": -0.8738444447517395, + "logps/rejected": -0.6471556425094604, + "loss": 0.7249, + "odds_ratio_loss": 7.202227592468262, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.06471556425094604, + "rewards/margins": 0.022668881341814995, + "rewards/rejected": -0.08738444745540619, + "sft_loss": 0.004629730246961117, + "step": 146 + }, + { + "epoch": 0.8063078505313679, + "grad_norm": 2.0196260017903396, + "learning_rate": 1.095138032587298e-06, + "logits/chosen": -0.6812174916267395, + "logits/rejected": -0.6691566705703735, + "logps/chosen": -0.8704717755317688, + "logps/rejected": -0.5937949419021606, + "loss": 0.6703, + "odds_ratio_loss": 6.671693801879883, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.059379495680332184, + "rewards/margins": 0.027667677029967308, + "rewards/rejected": -0.08704717457294464, + "sft_loss": 0.0031060208566486835, + "step": 147 + }, + { + "epoch": 0.8117929379499486, + "grad_norm": 3.415410597307275, + "learning_rate": 1.0356789542251939e-06, + "logits/chosen": -0.6848728656768799, + "logits/rejected": -0.6237005591392517, + "logps/chosen": -0.8248082995414734, + "logps/rejected": -0.6273688077926636, + "loss": 0.6999, + "odds_ratio_loss": 6.8439483642578125, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.0627368837594986, + "rewards/margins": 0.019743941724300385, + "rewards/rejected": -0.08248083293437958, + "sft_loss": 0.015508507378399372, + "step": 148 + }, + { + "epoch": 0.8172780253685293, + "grad_norm": 2.294183923518428, + "learning_rate": 9.776924592974257e-07, + "logits/chosen": -0.6827746033668518, + "logits/rejected": -0.6851654052734375, + "logps/chosen": -0.6488504409790039, + "logps/rejected": -0.6348989605903625, + "loss": 0.7172, + "odds_ratio_loss": 7.13347053527832, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.0634898990392685, + "rewards/margins": 0.0013951496221125126, + "rewards/rejected": -0.06488504260778427, + "sft_loss": 0.003804245265200734, + "step": 149 + }, + { + "epoch": 0.82276311278711, + "grad_norm": 2.2280214785837256, + "learning_rate": 9.212000874196953e-07, + "logits/chosen": -0.6501573920249939, + "logits/rejected": -0.6508501768112183, + "logps/chosen": -0.9070018529891968, + "logps/rejected": -0.5991308093070984, + "loss": 0.6697, + "odds_ratio_loss": 6.647359848022461, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.05991308391094208, + "rewards/margins": 0.030787091702222824, + "rewards/rejected": -0.0907001718878746, + "sft_loss": 0.004916863515973091, + "step": 150 + }, + { + "epoch": 0.8282482002056908, + "grad_norm": 2.1857741774575277, + "learning_rate": 8.662228232019876e-07, + "logits/chosen": -0.6549192667007446, + "logits/rejected": -0.6354272365570068, + "logps/chosen": -0.8838653564453125, + "logps/rejected": -0.6490474939346313, + "loss": 0.7277, + "odds_ratio_loss": 7.240221977233887, + "rewards/accuracies": 0.3984375, + "rewards/chosen": -0.06490474939346313, + "rewards/margins": 0.023481788113713264, + "rewards/rejected": -0.08838653564453125, + "sft_loss": 0.003684336319565773, + "step": 151 + }, + { + "epoch": 0.8337332876242715, + "grad_norm": 1.987517923655104, + "learning_rate": 8.127810884536402e-07, + "logits/chosen": -0.6637884378433228, + "logits/rejected": -0.6511223912239075, + "logps/chosen": -0.7829376459121704, + "logps/rejected": -0.6296254992485046, + "loss": 0.7088, + "odds_ratio_loss": 7.052569389343262, + "rewards/accuracies": 0.421875, + "rewards/chosen": -0.06296255439519882, + "rewards/margins": 0.015331214293837547, + "rewards/rejected": -0.07829376310110092, + "sft_loss": 0.003527900902554393, + "step": 152 + }, + { + "epoch": 0.8392183750428522, + "grad_norm": 2.378533291363515, + "learning_rate": 7.60894734597476e-07, + "logits/chosen": -0.7232801914215088, + "logits/rejected": -0.6934952139854431, + "logps/chosen": -0.673111081123352, + "logps/rejected": -0.6153658628463745, + "loss": 0.6995, + "odds_ratio_loss": 6.946869850158691, + "rewards/accuracies": 0.3671875, + "rewards/chosen": -0.06153658404946327, + "rewards/margins": 0.005774518009275198, + "rewards/rejected": -0.06731110066175461, + "sft_loss": 0.00481205340474844, + "step": 153 + }, + { + "epoch": 0.844703462461433, + "grad_norm": 2.105440987324829, + "learning_rate": 7.105830352958143e-07, + "logits/chosen": -0.6993785500526428, + "logits/rejected": -0.6673418879508972, + "logps/chosen": -0.8246726989746094, + "logps/rejected": -0.6575340032577515, + "loss": 0.7351, + "odds_ratio_loss": 7.309325695037842, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06575340777635574, + "rewards/margins": 0.016713865101337433, + "rewards/rejected": -0.08246727287769318, + "sft_loss": 0.004190374165773392, + "step": 154 + }, + { + "epoch": 0.8501885498800137, + "grad_norm": 2.0837906888947284, + "learning_rate": 6.618646792910893e-07, + "logits/chosen": -0.6814290881156921, + "logits/rejected": -0.6457995772361755, + "logps/chosen": -0.7051143050193787, + "logps/rejected": -0.5800508856773376, + "loss": 0.6505, + "odds_ratio_loss": 6.475788593292236, + "rewards/accuracies": 0.5234375, + "rewards/chosen": -0.058005087077617645, + "rewards/margins": 0.012506341561675072, + "rewards/rejected": -0.07051143050193787, + "sft_loss": 0.002879747888073325, + "step": 155 + }, + { + "epoch": 0.8556736372985945, + "grad_norm": 2.3480039986855235, + "learning_rate": 6.147577634637413e-07, + "logits/chosen": -0.709331214427948, + "logits/rejected": -0.6663718223571777, + "logps/chosen": -0.7591659426689148, + "logps/rejected": -0.6053435802459717, + "loss": 0.6828, + "odds_ratio_loss": 6.782125949859619, + "rewards/accuracies": 0.4296875, + "rewards/chosen": -0.060534361749887466, + "rewards/margins": 0.015382234007120132, + "rewards/rejected": -0.0759166032075882, + "sft_loss": 0.004561963025480509, + "step": 156 + }, + { + "epoch": 0.8611587247171751, + "grad_norm": 2.0141939824459105, + "learning_rate": 5.692797861099719e-07, + "logits/chosen": -0.6299980282783508, + "logits/rejected": -0.6576442122459412, + "logps/chosen": -0.6320803761482239, + "logps/rejected": -0.5932613611221313, + "loss": 0.6692, + "odds_ratio_loss": 6.658140659332275, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.059326138347387314, + "rewards/margins": 0.0038819059263914824, + "rewards/rejected": -0.06320804357528687, + "sft_loss": 0.0034193547908216715, + "step": 157 + }, + { + "epoch": 0.8666438121357559, + "grad_norm": 2.117743395165982, + "learning_rate": 5.254476404418341e-07, + "logits/chosen": -0.68840092420578, + "logits/rejected": -0.661196231842041, + "logps/chosen": -0.740635871887207, + "logps/rejected": -0.5854509472846985, + "loss": 0.6559, + "odds_ratio_loss": 6.521589756011963, + "rewards/accuracies": 0.4921875, + "rewards/chosen": -0.05854509398341179, + "rewards/margins": 0.015518485568463802, + "rewards/rejected": -0.07406358420848846, + "sft_loss": 0.003763651242479682, + "step": 158 + }, + { + "epoch": 0.8721288995543367, + "grad_norm": 2.0431260947506273, + "learning_rate": 4.832776083120983e-07, + "logits/chosen": -0.6590278148651123, + "logits/rejected": -0.5882821679115295, + "logps/chosen": -0.6959192752838135, + "logps/rejected": -0.5968501567840576, + "loss": 0.6729, + "odds_ratio_loss": 6.690352916717529, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.05968501418828964, + "rewards/margins": 0.009906908497214317, + "rewards/rejected": -0.06959191709756851, + "sft_loss": 0.0038192402571439743, + "step": 159 + }, + { + "epoch": 0.8776139869729174, + "grad_norm": 2.06927462291822, + "learning_rate": 4.4278535416620914e-07, + "logits/chosen": -0.7277648448944092, + "logits/rejected": -0.6717500686645508, + "logps/chosen": -0.6366525888442993, + "logps/rejected": -0.5973187685012817, + "loss": 0.6734, + "odds_ratio_loss": 6.694884300231934, + "rewards/accuracies": 0.4296875, + "rewards/chosen": -0.059731870889663696, + "rewards/margins": 0.003933383151888847, + "rewards/rejected": -0.06366526335477829, + "sft_loss": 0.003877094015479088, + "step": 160 + }, + { + "epoch": 0.8830990743914982, + "grad_norm": 2.5991781469777684, + "learning_rate": 4.0398591922357787e-07, + "logits/chosen": -0.6706148386001587, + "logits/rejected": -0.6340373158454895, + "logps/chosen": -0.7028053402900696, + "logps/rejected": -0.5866550207138062, + "loss": 0.6609, + "odds_ratio_loss": 6.551386833190918, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.05866550654172897, + "rewards/margins": 0.011615024879574776, + "rewards/rejected": -0.0702805295586586, + "sft_loss": 0.005807496141642332, + "step": 161 + }, + { + "epoch": 0.8885841618100788, + "grad_norm": 1.9365443243739644, + "learning_rate": 3.6689371589039013e-07, + "logits/chosen": -0.66963130235672, + "logits/rejected": -0.5979880690574646, + "logps/chosen": -0.7448896169662476, + "logps/rejected": -0.6330960392951965, + "loss": 0.7074, + "odds_ratio_loss": 7.0415358543396, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.06330960988998413, + "rewards/margins": 0.011179355904459953, + "rewards/rejected": -0.07448896020650864, + "sft_loss": 0.00325138121843338, + "step": 162 + }, + { + "epoch": 0.8940692492286596, + "grad_norm": 2.121200239152531, + "learning_rate": 3.315225224059809e-07, + "logits/chosen": -0.6617680191993713, + "logits/rejected": -0.6018663644790649, + "logps/chosen": -0.8706423044204712, + "logps/rejected": -0.6283361911773682, + "loss": 0.7038, + "odds_ratio_loss": 6.99859094619751, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.06283361464738846, + "rewards/margins": 0.024230612441897392, + "rewards/rejected": -0.087064228951931, + "sft_loss": 0.003959609195590019, + "step": 163 + }, + { + "epoch": 0.8995543366472403, + "grad_norm": 2.148488564632459, + "learning_rate": 2.9788547772478416e-07, + "logits/chosen": -0.6609708666801453, + "logits/rejected": -0.6172842979431152, + "logps/chosen": -0.7381229400634766, + "logps/rejected": -0.6342223882675171, + "loss": 0.7124, + "odds_ratio_loss": 7.068850040435791, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06342223286628723, + "rewards/margins": 0.010390058159828186, + "rewards/rejected": -0.07381229847669601, + "sft_loss": 0.005511356517672539, + "step": 164 + }, + { + "epoch": 0.905039424065821, + "grad_norm": 2.1106479655528796, + "learning_rate": 2.6599507663574387e-07, + "logits/chosen": -0.6930041313171387, + "logits/rejected": -0.666252851486206, + "logps/chosen": -0.6971046328544617, + "logps/rejected": -0.6307591199874878, + "loss": 0.7082, + "odds_ratio_loss": 7.04301643371582, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.06307591497898102, + "rewards/margins": 0.006634547375142574, + "rewards/rejected": -0.06971046328544617, + "sft_loss": 0.003914778120815754, + "step": 165 + }, + { + "epoch": 0.9105245114844018, + "grad_norm": 2.12980458488719, + "learning_rate": 2.3586316512101416e-07, + "logits/chosen": -0.6665065288543701, + "logits/rejected": -0.6230036616325378, + "logps/chosen": -0.7909663915634155, + "logps/rejected": -0.5812108516693115, + "loss": 0.6543, + "odds_ratio_loss": 6.501043796539307, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05812108889222145, + "rewards/margins": 0.02097555249929428, + "rewards/rejected": -0.07909663766622543, + "sft_loss": 0.004211761057376862, + "step": 166 + }, + { + "epoch": 0.9160095989029825, + "grad_norm": 1.9925848439582086, + "learning_rate": 2.0750093595565735e-07, + "logits/chosen": -0.6866216659545898, + "logits/rejected": -0.6430922150611877, + "logps/chosen": -0.7136643528938293, + "logps/rejected": -0.6055830121040344, + "loss": 0.681, + "odds_ratio_loss": 6.776293754577637, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.06055830419063568, + "rewards/margins": 0.010808136314153671, + "rewards/rejected": -0.07136643677949905, + "sft_loss": 0.0033255324233323336, + "step": 167 + }, + { + "epoch": 0.9214946863215633, + "grad_norm": 1.7865099751357647, + "learning_rate": 1.8091892454998595e-07, + "logits/chosen": -0.7047858238220215, + "logits/rejected": -0.6222947835922241, + "logps/chosen": -0.8674630522727966, + "logps/rejected": -0.635217547416687, + "loss": 0.7125, + "odds_ratio_loss": 7.095528602600098, + "rewards/accuracies": 0.4140625, + "rewards/chosen": -0.06352175027132034, + "rewards/margins": 0.023224547505378723, + "rewards/rejected": -0.08674629777669907, + "sft_loss": 0.0029704533517360687, + "step": 168 + }, + { + "epoch": 0.926979773740144, + "grad_norm": 2.4300450960067157, + "learning_rate": 1.561270050360897e-07, + "logits/chosen": -0.6807259321212769, + "logits/rejected": -0.6181145310401917, + "logps/chosen": -0.939436674118042, + "logps/rejected": -0.6324220895767212, + "loss": 0.7055, + "odds_ratio_loss": 7.014206886291504, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.06324220448732376, + "rewards/margins": 0.03070145845413208, + "rewards/rejected": -0.09394367039203644, + "sft_loss": 0.0040901475585997105, + "step": 169 + }, + { + "epoch": 0.9324648611587247, + "grad_norm": 1.9641250789279827, + "learning_rate": 1.33134386599994e-07, + "logits/chosen": -0.7125731110572815, + "logits/rejected": -0.673245370388031, + "logps/chosen": -0.9093432426452637, + "logps/rejected": -0.6548852324485779, + "loss": 0.7325, + "odds_ratio_loss": 7.290490627288818, + "rewards/accuracies": 0.3984375, + "rewards/chosen": -0.06548852473497391, + "rewards/margins": 0.025445803999900818, + "rewards/rejected": -0.09093432873487473, + "sft_loss": 0.003472922369837761, + "step": 170 + }, + { + "epoch": 0.9379499485773054, + "grad_norm": 2.2978574983286086, + "learning_rate": 1.1194961006082972e-07, + "logits/chosen": -0.6797432899475098, + "logits/rejected": -0.6302465200424194, + "logps/chosen": -0.826444685459137, + "logps/rejected": -0.6436266899108887, + "loss": 0.7225, + "odds_ratio_loss": 7.156667709350586, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.06436267495155334, + "rewards/margins": 0.01828179880976677, + "rewards/rejected": -0.08264447748661041, + "sft_loss": 0.006842226721346378, + "step": 171 + }, + { + "epoch": 0.9434350359958862, + "grad_norm": 2.3466230771755594, + "learning_rate": 9.258054469825972e-08, + "logits/chosen": -0.7173478007316589, + "logits/rejected": -0.6696880459785461, + "logps/chosen": -0.6279891133308411, + "logps/rejected": -0.6100993752479553, + "loss": 0.6933, + "odds_ratio_loss": 6.895969390869141, + "rewards/accuracies": 0.4296875, + "rewards/chosen": -0.06100994348526001, + "rewards/margins": 0.0017889718292281032, + "rewards/rejected": -0.06279890984296799, + "sft_loss": 0.0036887084133923054, + "step": 172 + }, + { + "epoch": 0.948920123414467, + "grad_norm": 2.2629683724373226, + "learning_rate": 7.503438532937169e-08, + "logits/chosen": -0.6861296892166138, + "logits/rejected": -0.6311776041984558, + "logps/chosen": -0.6609523296356201, + "logps/rejected": -0.6280564069747925, + "loss": 0.7091, + "odds_ratio_loss": 7.0425190925598145, + "rewards/accuracies": 0.3984375, + "rewards/chosen": -0.0628056451678276, + "rewards/margins": 0.003289599437266588, + "rewards/rejected": -0.06609524041414261, + "sft_loss": 0.00483954232186079, + "step": 173 + }, + { + "epoch": 0.9544052108330476, + "grad_norm": 2.3036615374451106, + "learning_rate": 5.9317649636088656e-08, + "logits/chosen": -0.6945533156394958, + "logits/rejected": -0.6203831434249878, + "logps/chosen": -0.8298579454421997, + "logps/rejected": -0.6409691572189331, + "loss": 0.7149, + "odds_ratio_loss": 7.09760856628418, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.06409691274166107, + "rewards/margins": 0.018888890743255615, + "rewards/rejected": -0.08298580348491669, + "sft_loss": 0.005160990636795759, + "step": 174 + }, + { + "epoch": 0.9598902982516284, + "grad_norm": 2.450600909672264, + "learning_rate": 4.543617574412185e-08, + "logits/chosen": -0.7120048403739929, + "logits/rejected": -0.6609210968017578, + "logps/chosen": -0.8656575679779053, + "logps/rejected": -0.6917891502380371, + "loss": 0.7722, + "odds_ratio_loss": 7.664754390716553, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.06917892396450043, + "rewards/margins": 0.01738683693110943, + "rewards/rejected": -0.08656575530767441, + "sft_loss": 0.0056978315114974976, + "step": 175 + }, + { + "epoch": 0.9653753856702091, + "grad_norm": 2.0318496856067942, + "learning_rate": 3.339512005434309e-08, + "logits/chosen": -0.6582708954811096, + "logits/rejected": -0.5941011905670166, + "logps/chosen": -0.7900950908660889, + "logps/rejected": -0.5978555083274841, + "loss": 0.6728, + "odds_ratio_loss": 6.6846771240234375, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.05978555232286453, + "rewards/margins": 0.019223961979150772, + "rewards/rejected": -0.079009510576725, + "sft_loss": 0.004282682668417692, + "step": 176 + }, + { + "epoch": 0.9708604730887899, + "grad_norm": 2.101347721071758, + "learning_rate": 2.319895532739369e-08, + "logits/chosen": -0.6375358700752258, + "logits/rejected": -0.6700392961502075, + "logps/chosen": -0.7632755637168884, + "logps/rejected": -0.653388261795044, + "loss": 0.7326, + "odds_ratio_loss": 7.288682460784912, + "rewards/accuracies": 0.4296875, + "rewards/chosen": -0.06533882021903992, + "rewards/margins": 0.010988726280629635, + "rewards/rejected": -0.07632754743099213, + "sft_loss": 0.003733065677806735, + "step": 177 + }, + { + "epoch": 0.9763455605073705, + "grad_norm": 2.1469363578193033, + "learning_rate": 1.4851469022234e-08, + "logits/chosen": -0.6644502878189087, + "logits/rejected": -0.6307498216629028, + "logps/chosen": -0.6507246494293213, + "logps/rejected": -0.5957509875297546, + "loss": 0.6743, + "odds_ratio_loss": 6.702306270599365, + "rewards/accuracies": 0.421875, + "rewards/chosen": -0.059575099498033524, + "rewards/margins": 0.00549736525863409, + "rewards/rejected": -0.06507246941328049, + "sft_loss": 0.004071300383657217, + "step": 178 + }, + { + "epoch": 0.9818306479259513, + "grad_norm": 2.2611714428995313, + "learning_rate": 8.35576188926046e-09, + "logits/chosen": -0.685587465763092, + "logits/rejected": -0.6573466658592224, + "logps/chosen": -0.7864770889282227, + "logps/rejected": -0.5809304714202881, + "loss": 0.6574, + "odds_ratio_loss": 6.511574745178223, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.058093056082725525, + "rewards/margins": 0.020554661750793457, + "rewards/rejected": -0.07864771038293839, + "sft_loss": 0.006258727982640266, + "step": 179 + }, + { + "epoch": 0.9873157353445321, + "grad_norm": 2.10857188318165, + "learning_rate": 3.71424681850141e-09, + "logits/chosen": -0.7009637355804443, + "logits/rejected": -0.7090278267860413, + "logps/chosen": -0.7139046788215637, + "logps/rejected": -0.6147310137748718, + "loss": 0.6931, + "odds_ratio_loss": 6.891450881958008, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.06147310137748718, + "rewards/margins": 0.009917369112372398, + "rewards/rejected": -0.07139047980308533, + "sft_loss": 0.0039411671459674835, + "step": 180 + }, + { + "epoch": 0.9928008227631128, + "grad_norm": 2.18736540958863, + "learning_rate": 9.286479433257e-10, + "logits/chosen": -0.6564709544181824, + "logits/rejected": -0.5686830878257751, + "logps/chosen": -0.8073873519897461, + "logps/rejected": -0.573136031627655, + "loss": 0.6412, + "odds_ratio_loss": 6.370337009429932, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.05731360614299774, + "rewards/margins": 0.023425137624144554, + "rewards/rejected": -0.08073873817920685, + "sft_loss": 0.004156089387834072, + "step": 181 + }, + { + "epoch": 0.9982859101816935, + "grad_norm": 2.121915449515644, + "learning_rate": 0.0, + "logits/chosen": -0.7368127703666687, + "logits/rejected": -0.6576196551322937, + "logps/chosen": -1.124916672706604, + "logps/rejected": -0.5984194278717041, + "loss": 0.6649, + "odds_ratio_loss": 6.61328125, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.05984194204211235, + "rewards/margins": 0.05264972895383835, + "rewards/rejected": -0.1124916821718216, + "sft_loss": 0.0035864519886672497, + "step": 182 + }, + { + "epoch": 0.9982859101816935, + "step": 182, + "total_flos": 58779245903872.0, + "train_loss": 0.7302135016236987, + "train_runtime": 13700.9712, + "train_samples_per_second": 1.703, + "train_steps_per_second": 0.013 + } + ], + "logging_steps": 1, + "max_steps": 182, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 182, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 58779245903872.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}