{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990762978015888, "eval_steps": 400, "global_step": 507, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001970564689943962, "grad_norm": 3.539861305637653, "learning_rate": 9.803921568627451e-09, "logits/chosen": -0.03196336328983307, "logits/rejected": -0.15967734158039093, "logps/chosen": -99.96153259277344, "logps/rejected": -93.94828033447266, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00985282344971981, "grad_norm": 3.74004975366523, "learning_rate": 4.901960784313725e-08, "logits/chosen": -0.042198292911052704, "logits/rejected": -0.34676456451416016, "logps/chosen": -112.20402526855469, "logps/rejected": -101.837646484375, "loss": 0.6932, "rewards/accuracies": 0.421875, "rewards/chosen": 0.0011955354129895568, "rewards/margins": 0.0004524323157966137, "rewards/rejected": 0.0007431029807776213, "step": 5 }, { "epoch": 0.01970564689943962, "grad_norm": 3.5769285168326417, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.16411139070987701, "logits/rejected": -0.29173845052719116, "logps/chosen": -94.1719741821289, "logps/rejected": -96.02313232421875, "loss": 0.6933, "rewards/accuracies": 0.4375, "rewards/chosen": 4.2173640395049006e-05, "rewards/margins": -0.001043520518578589, "rewards/rejected": 0.001085694064386189, "step": 10 }, { "epoch": 0.02955847034915943, "grad_norm": 3.3653465070579673, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -0.07717452943325043, "logits/rejected": -0.3865337371826172, "logps/chosen": -100.953125, "logps/rejected": -92.61229705810547, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.00025451680994592607, "rewards/margins": -0.0001873960136435926, "rewards/rejected": -6.712078902637586e-05, "step": 15 }, { "epoch": 0.03941129379887924, "grad_norm": 3.373649973394512, "learning_rate": 1.96078431372549e-07, "logits/chosen": -0.0971444696187973, "logits/rejected": -0.35578858852386475, "logps/chosen": -106.3487319946289, "logps/rejected": -102.8326416015625, "loss": 0.6929, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0003346280718687922, "rewards/margins": 0.0001382694172207266, "rewards/rejected": -0.00047289757640101016, "step": 20 }, { "epoch": 0.049264117248599054, "grad_norm": 3.224755352064536, "learning_rate": 2.4509803921568627e-07, "logits/chosen": -0.10794611275196075, "logits/rejected": -0.29162880778312683, "logps/chosen": -99.07095336914062, "logps/rejected": -95.20055389404297, "loss": 0.6927, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.00042314338497817516, "rewards/margins": 0.0012360246619209647, "rewards/rejected": -0.0008128813351504505, "step": 25 }, { "epoch": 0.05911694069831886, "grad_norm": 3.444342352448875, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.12300117313861847, "logits/rejected": -0.27830368280410767, "logps/chosen": -105.83805847167969, "logps/rejected": -104.1891860961914, "loss": 0.6914, "rewards/accuracies": 0.75, "rewards/chosen": 0.002045437227934599, "rewards/margins": 0.004656647797673941, "rewards/rejected": -0.002611211035400629, "step": 30 }, { "epoch": 0.06896976414803867, "grad_norm": 3.5299163808469496, "learning_rate": 3.431372549019608e-07, "logits/chosen": -0.03614411875605583, "logits/rejected": -0.3109976053237915, "logps/chosen": -99.59745788574219, "logps/rejected": -98.72537231445312, "loss": 0.6903, "rewards/accuracies": 0.6875, "rewards/chosen": 0.000885780609678477, "rewards/margins": 0.0061457473784685135, "rewards/rejected": -0.005259966477751732, "step": 35 }, { "epoch": 0.07882258759775848, "grad_norm": 3.9275316344127242, "learning_rate": 3.92156862745098e-07, "logits/chosen": -0.08717192709445953, "logits/rejected": -0.30741095542907715, "logps/chosen": -99.77064514160156, "logps/rejected": -96.07014465332031, "loss": 0.6882, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0018755150958895683, "rewards/margins": 0.007984376512467861, "rewards/rejected": -0.00985989160835743, "step": 40 }, { "epoch": 0.0886754110474783, "grad_norm": 3.5883285534109493, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -0.032524555921554565, "logits/rejected": -0.3046508729457855, "logps/chosen": -96.41732788085938, "logps/rejected": -95.6390380859375, "loss": 0.6842, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.005956724286079407, "rewards/margins": 0.01741139218211174, "rewards/rejected": -0.023368116468191147, "step": 45 }, { "epoch": 0.09852823449719811, "grad_norm": 3.3794927211727392, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.12240082025527954, "logits/rejected": -0.28499796986579895, "logps/chosen": -106.6746597290039, "logps/rejected": -104.4252700805664, "loss": 0.6811, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.017990436404943466, "rewards/margins": 0.024439355358481407, "rewards/rejected": -0.042429789900779724, "step": 50 }, { "epoch": 0.10838105794691791, "grad_norm": 3.8844805866458234, "learning_rate": 4.999050767562379e-07, "logits/chosen": -0.06140371039509773, "logits/rejected": -0.36449694633483887, "logps/chosen": -112.74903869628906, "logps/rejected": -107.06253814697266, "loss": 0.6729, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.048891425132751465, "rewards/margins": 0.04517052322626114, "rewards/rejected": -0.0940619483590126, "step": 55 }, { "epoch": 0.11823388139663772, "grad_norm": 4.246075688346676, "learning_rate": 4.99519574616467e-07, "logits/chosen": -0.08420858532190323, "logits/rejected": -0.2296031415462494, "logps/chosen": -106.52708435058594, "logps/rejected": -114.71268463134766, "loss": 0.6598, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.0871507003903389, "rewards/margins": 0.07720854133367538, "rewards/rejected": -0.16435924172401428, "step": 60 }, { "epoch": 0.12808670484635754, "grad_norm": 4.078056870628684, "learning_rate": 4.988380179235842e-07, "logits/chosen": -0.04726668819785118, "logits/rejected": -0.2177656590938568, "logps/chosen": -117.36442565917969, "logps/rejected": -122.31159973144531, "loss": 0.6488, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.15060417354106903, "rewards/margins": 0.0838489979505539, "rewards/rejected": -0.23445317149162292, "step": 65 }, { "epoch": 0.13793952829607734, "grad_norm": 3.820719733352528, "learning_rate": 4.978612153434526e-07, "logits/chosen": -0.05979006737470627, "logits/rejected": -0.19735023379325867, "logps/chosen": -124.60560607910156, "logps/rejected": -155.1808624267578, "loss": 0.6318, "rewards/accuracies": 0.75, "rewards/chosen": -0.2229953557252884, "rewards/margins": 0.31397417187690735, "rewards/rejected": -0.5369695425033569, "step": 70 }, { "epoch": 0.14779235174579716, "grad_norm": 5.031693747430946, "learning_rate": 4.965903258506806e-07, "logits/chosen": -0.059552647173404694, "logits/rejected": -0.19415248930454254, "logps/chosen": -136.63735961914062, "logps/rejected": -191.6803741455078, "loss": 0.6086, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.35170239210128784, "rewards/margins": 0.49619174003601074, "rewards/rejected": -0.8478941917419434, "step": 75 }, { "epoch": 0.15764517519551696, "grad_norm": 5.160412651014448, "learning_rate": 4.950268573535011e-07, "logits/chosen": -0.0038110867608338594, "logits/rejected": -0.15630409121513367, "logps/chosen": -147.3404083251953, "logps/rejected": -173.90728759765625, "loss": 0.5967, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4520920217037201, "rewards/margins": 0.27063217759132385, "rewards/rejected": -0.7227243185043335, "step": 80 }, { "epoch": 0.16749799864523676, "grad_norm": 6.145536461291785, "learning_rate": 4.93172664904641e-07, "logits/chosen": 0.01297803781926632, "logits/rejected": -0.1908242255449295, "logps/chosen": -167.2682647705078, "logps/rejected": -198.43177795410156, "loss": 0.5701, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.589021623134613, "rewards/margins": 0.3708820641040802, "rewards/rejected": -0.9599035978317261, "step": 85 }, { "epoch": 0.1773508220949566, "grad_norm": 60.6627948159201, "learning_rate": 4.910299485003033e-07, "logits/chosen": -0.007486692164093256, "logits/rejected": -0.18025372922420502, "logps/chosen": -183.25140380859375, "logps/rejected": -288.912109375, "loss": 0.5755, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8218961954116821, "rewards/margins": 1.0762311220169067, "rewards/rejected": -1.8981273174285889, "step": 90 }, { "epoch": 0.1872036455446764, "grad_norm": 6.619891594866407, "learning_rate": 4.886012504698769e-07, "logits/chosen": -0.00334315188229084, "logits/rejected": -0.3066111207008362, "logps/chosen": -222.06753540039062, "logps/rejected": -272.4456481933594, "loss": 0.5397, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1969377994537354, "rewards/margins": 0.5773923397064209, "rewards/rejected": -1.7743301391601562, "step": 95 }, { "epoch": 0.19705646899439622, "grad_norm": 6.740434195327616, "learning_rate": 4.858894524594652e-07, "logits/chosen": -0.1177954450249672, "logits/rejected": -0.3193029761314392, "logps/chosen": -216.84396362304688, "logps/rejected": -431.8846740722656, "loss": 0.5071, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.120563268661499, "rewards/margins": 2.212986707687378, "rewards/rejected": -3.333550214767456, "step": 100 }, { "epoch": 0.20690929244411602, "grad_norm": 7.617693374089276, "learning_rate": 4.828977720128198e-07, "logits/chosen": -0.1505376100540161, "logits/rejected": -0.3399549126625061, "logps/chosen": -235.60617065429688, "logps/rejected": -358.7390441894531, "loss": 0.497, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3088951110839844, "rewards/margins": 1.3302855491638184, "rewards/rejected": -2.639180898666382, "step": 105 }, { "epoch": 0.21676211589383582, "grad_norm": 7.105788785064987, "learning_rate": 4.796297587537285e-07, "logits/chosen": -0.1506040096282959, "logits/rejected": -0.3250656723976135, "logps/chosen": -257.82464599609375, "logps/rejected": -416.75958251953125, "loss": 0.4453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5325710773468018, "rewards/margins": 1.6099077463150024, "rewards/rejected": -3.1424789428710938, "step": 110 }, { "epoch": 0.22661493934355564, "grad_norm": 8.60769452133117, "learning_rate": 4.760892901743944e-07, "logits/chosen": -0.13159573078155518, "logits/rejected": -0.3428110182285309, "logps/chosen": -300.0444030761719, "logps/rejected": -514.811279296875, "loss": 0.4607, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9326972961425781, "rewards/margins": 2.217697858810425, "rewards/rejected": -4.150395393371582, "step": 115 }, { "epoch": 0.23646776279327544, "grad_norm": 10.209138794878942, "learning_rate": 4.7228056703479626e-07, "logits/chosen": -0.16759036481380463, "logits/rejected": -0.3990747332572937, "logps/chosen": -299.72705078125, "logps/rejected": -433.87823486328125, "loss": 0.4381, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9569202661514282, "rewards/margins": 1.37308669090271, "rewards/rejected": -3.3300068378448486, "step": 120 }, { "epoch": 0.24632058624299527, "grad_norm": 28.609743111268017, "learning_rate": 4.6820810837849535e-07, "logits/chosen": -0.20535437762737274, "logits/rejected": -0.4542999267578125, "logps/chosen": -307.52362060546875, "logps/rejected": -520.55322265625, "loss": 0.4403, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.0457329750061035, "rewards/margins": 2.181645393371582, "rewards/rejected": -4.227377891540527, "step": 125 }, { "epoch": 0.25617340969271507, "grad_norm": 11.650217681846684, "learning_rate": 4.63876746170797e-07, "logits/chosen": -0.23113389313220978, "logits/rejected": -0.47989240288734436, "logps/chosen": -344.0136413574219, "logps/rejected": -512.1621704101562, "loss": 0.474, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.392538547515869, "rewards/margins": 1.7528173923492432, "rewards/rejected": -4.145355701446533, "step": 130 }, { "epoch": 0.2660262331424349, "grad_norm": 12.280394789341818, "learning_rate": 4.592916195656321e-07, "logits/chosen": -0.2849624454975128, "logits/rejected": -0.4641537070274353, "logps/chosen": -360.0565490722656, "logps/rejected": -593.3041381835938, "loss": 0.4143, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.587618112564087, "rewards/margins": 2.348435401916504, "rewards/rejected": -4.936053276062012, "step": 135 }, { "epoch": 0.27587905659215467, "grad_norm": 11.781977035196428, "learning_rate": 4.544581688079602e-07, "logits/chosen": -0.27669793367385864, "logits/rejected": -0.4249224066734314, "logps/chosen": -346.13238525390625, "logps/rejected": -542.8922119140625, "loss": 0.3764, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.43330454826355, "rewards/margins": 1.9707410335540771, "rewards/rejected": -4.404046058654785, "step": 140 }, { "epoch": 0.2857318800418745, "grad_norm": 12.725521218797944, "learning_rate": 4.493821287789272e-07, "logits/chosen": -0.29486262798309326, "logits/rejected": -0.45644649863243103, "logps/chosen": -350.2372131347656, "logps/rejected": -533.0991821289062, "loss": 0.3874, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.510737657546997, "rewards/margins": 1.8067106008529663, "rewards/rejected": -4.317447662353516, "step": 145 }, { "epoch": 0.2955847034915943, "grad_norm": 10.47948759187309, "learning_rate": 4.4406952219143934e-07, "logits/chosen": -0.20662228763103485, "logits/rejected": -0.47787055373191833, "logps/chosen": -331.3938903808594, "logps/rejected": -496.76922607421875, "loss": 0.3901, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2830259799957275, "rewards/margins": 1.7167097330093384, "rewards/rejected": -3.9997353553771973, "step": 150 }, { "epoch": 0.3054375269413141, "grad_norm": 9.243554014229469, "learning_rate": 4.38526652444224e-07, "logits/chosen": -0.23520083725452423, "logits/rejected": -0.45836538076400757, "logps/chosen": -338.83935546875, "logps/rejected": -600.7113037109375, "loss": 0.3776, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.335677146911621, "rewards/margins": 2.6365208625793457, "rewards/rejected": -4.972198009490967, "step": 155 }, { "epoch": 0.3152903503910339, "grad_norm": 16.302237845544973, "learning_rate": 4.3276009614285824e-07, "logits/chosen": -0.27397865056991577, "logits/rejected": -0.4509497582912445, "logps/chosen": -340.3817443847656, "logps/rejected": -519.3179321289062, "loss": 0.373, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.389813184738159, "rewards/margins": 1.820207953453064, "rewards/rejected": -4.210021018981934, "step": 160 }, { "epoch": 0.32514317384075375, "grad_norm": 12.790316412360088, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.3117635250091553, "logits/rejected": -0.5069926977157593, "logps/chosen": -408.1647033691406, "logps/rejected": -684.1016845703125, "loss": 0.3554, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.9762721061706543, "rewards/margins": 2.816100597381592, "rewards/rejected": -5.792372703552246, "step": 165 }, { "epoch": 0.3349959972904735, "grad_norm": 12.163515409448873, "learning_rate": 4.2058354920054043e-07, "logits/chosen": -0.32501062750816345, "logits/rejected": -0.4444299340248108, "logps/chosen": -392.1680603027344, "logps/rejected": -582.3343505859375, "loss": 0.3309, "rewards/accuracies": 0.8125, "rewards/chosen": -2.916874885559082, "rewards/margins": 1.9237887859344482, "rewards/rejected": -4.840663909912109, "step": 170 }, { "epoch": 0.34484882074019335, "grad_norm": 16.509396763840595, "learning_rate": 4.141880060119336e-07, "logits/chosen": -0.30287298560142517, "logits/rejected": -0.5002428293228149, "logps/chosen": -392.53900146484375, "logps/rejected": -621.8858642578125, "loss": 0.3799, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.818398952484131, "rewards/margins": 2.3253607749938965, "rewards/rejected": -5.143759250640869, "step": 175 }, { "epoch": 0.3547016441899132, "grad_norm": 19.861599589919685, "learning_rate": 4.0759765403198877e-07, "logits/chosen": -0.2992296814918518, "logits/rejected": -0.46128687262535095, "logps/chosen": -369.04058837890625, "logps/rejected": -594.5716552734375, "loss": 0.3318, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.6274733543395996, "rewards/margins": 2.262349843978882, "rewards/rejected": -4.889822959899902, "step": 180 }, { "epoch": 0.364554467639633, "grad_norm": 20.974452355570566, "learning_rate": 4.008203127021797e-07, "logits/chosen": -0.25085026025772095, "logits/rejected": -0.46279406547546387, "logps/chosen": -358.7597351074219, "logps/rejected": -571.667724609375, "loss": 0.3218, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.5963218212127686, "rewards/margins": 2.143275022506714, "rewards/rejected": -4.739596843719482, "step": 185 }, { "epoch": 0.3744072910893528, "grad_norm": 14.073373162388602, "learning_rate": 3.9386402332652754e-07, "logits/chosen": -0.23608064651489258, "logits/rejected": -0.5011879205703735, "logps/chosen": -423.06573486328125, "logps/rejected": -701.5946044921875, "loss": 0.3171, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.2071080207824707, "rewards/margins": 2.7721924781799316, "rewards/rejected": -5.979300498962402, "step": 190 }, { "epoch": 0.3842601145390726, "grad_norm": 22.462855346029198, "learning_rate": 3.867370395306068e-07, "logits/chosen": -0.21024306118488312, "logits/rejected": -0.48892831802368164, "logps/chosen": -421.2330627441406, "logps/rejected": -660.6759643554688, "loss": 0.3264, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.2450497150421143, "rewards/margins": 2.429110527038574, "rewards/rejected": -5.674160003662109, "step": 195 }, { "epoch": 0.39411293798879243, "grad_norm": 12.932519297904385, "learning_rate": 3.794478174686328e-07, "logits/chosen": -0.29106825590133667, "logits/rejected": -0.4926213324069977, "logps/chosen": -411.04840087890625, "logps/rejected": -645.6761474609375, "loss": 0.344, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0619587898254395, "rewards/margins": 2.4115443229675293, "rewards/rejected": -5.473503112792969, "step": 200 }, { "epoch": 0.4039657614385122, "grad_norm": 12.885808088365131, "learning_rate": 3.720050057902495e-07, "logits/chosen": -0.28087863326072693, "logits/rejected": -0.5519760847091675, "logps/chosen": -393.69659423828125, "logps/rejected": -613.2288208007812, "loss": 0.3363, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.869983196258545, "rewards/margins": 2.265076160430908, "rewards/rejected": -5.135059356689453, "step": 205 }, { "epoch": 0.41381858488823203, "grad_norm": 16.468484087079414, "learning_rate": 3.644174353789204e-07, "logits/chosen": -0.3173820376396179, "logits/rejected": -0.41476958990097046, "logps/chosen": -439.092529296875, "logps/rejected": -734.161865234375, "loss": 0.2861, "rewards/accuracies": 0.84375, "rewards/chosen": -3.3481502532958984, "rewards/margins": 2.985670566558838, "rewards/rejected": -6.3338212966918945, "step": 210 }, { "epoch": 0.42367140833795186, "grad_norm": 22.743564527030898, "learning_rate": 3.566941088741009e-07, "logits/chosen": -0.32451528310775757, "logits/rejected": -0.4603727459907532, "logps/chosen": -506.1102600097656, "logps/rejected": -843.5612182617188, "loss": 0.3019, "rewards/accuracies": 0.875, "rewards/chosen": -4.0731306076049805, "rewards/margins": 3.3347747325897217, "rewards/rejected": -7.407905578613281, "step": 215 }, { "epoch": 0.43352423178767163, "grad_norm": 13.898026449525894, "learning_rate": 3.488441899896217e-07, "logits/chosen": -0.36527490615844727, "logits/rejected": -0.5059664845466614, "logps/chosen": -498.8099670410156, "logps/rejected": -819.1517333984375, "loss": 0.2831, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.9226620197296143, "rewards/margins": 3.1661388874053955, "rewards/rejected": -7.088801383972168, "step": 220 }, { "epoch": 0.44337705523739146, "grad_norm": 13.52248766735538, "learning_rate": 3.408769926409574e-07, "logits/chosen": -0.2923319637775421, "logits/rejected": -0.5522831082344055, "logps/chosen": -440.209716796875, "logps/rejected": -757.4591064453125, "loss": 0.2934, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.353994369506836, "rewards/margins": 3.2045319080352783, "rewards/rejected": -6.558526039123535, "step": 225 }, { "epoch": 0.4532298786871113, "grad_norm": 25.763440445028106, "learning_rate": 3.3280196989428263e-07, "logits/chosen": -0.28175559639930725, "logits/rejected": -0.5395274758338928, "logps/chosen": -408.2548828125, "logps/rejected": -689.34326171875, "loss": 0.3182, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.0639290809631348, "rewards/margins": 2.8385584354400635, "rewards/rejected": -5.902487754821777, "step": 230 }, { "epoch": 0.46308270213683106, "grad_norm": 25.454226685247495, "learning_rate": 3.2462870275042367e-07, "logits/chosen": -0.3556864261627197, "logits/rejected": -0.5125774145126343, "logps/chosen": -393.869140625, "logps/rejected": -721.3883666992188, "loss": 0.2842, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.9227001667022705, "rewards/margins": 3.2367324829101562, "rewards/rejected": -6.159432888031006, "step": 235 }, { "epoch": 0.4729355255865509, "grad_norm": 17.441738861595127, "learning_rate": 3.1636688877701806e-07, "logits/chosen": -0.2837878167629242, "logits/rejected": -0.4791291654109955, "logps/chosen": -416.69989013671875, "logps/rejected": -730.2517700195312, "loss": 0.2738, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.118884563446045, "rewards/margins": 3.163923978805542, "rewards/rejected": -6.282808303833008, "step": 240 }, { "epoch": 0.4827883490362707, "grad_norm": 15.400566409989613, "learning_rate": 3.080263306023669e-07, "logits/chosen": -0.30386677384376526, "logits/rejected": -0.5350344777107239, "logps/chosen": -433.35760498046875, "logps/rejected": -742.5108642578125, "loss": 0.2983, "rewards/accuracies": 0.875, "rewards/chosen": -3.27720308303833, "rewards/margins": 3.1466774940490723, "rewards/rejected": -6.423880100250244, "step": 245 }, { "epoch": 0.49264117248599054, "grad_norm": 13.243347184925653, "learning_rate": 2.996169242846328e-07, "logits/chosen": -0.2507760524749756, "logits/rejected": -0.5307763814926147, "logps/chosen": -420.93218994140625, "logps/rejected": -727.508056640625, "loss": 0.2864, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.2063565254211426, "rewards/margins": 3.135401725769043, "rewards/rejected": -6.3417582511901855, "step": 250 }, { "epoch": 0.5024939959357103, "grad_norm": 15.905893971492043, "learning_rate": 2.911486475701835e-07, "logits/chosen": -0.3323180675506592, "logits/rejected": -0.500321090221405, "logps/chosen": -417.96533203125, "logps/rejected": -669.329345703125, "loss": 0.3038, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.0756640434265137, "rewards/margins": 2.5683434009552, "rewards/rejected": -5.644008159637451, "step": 255 }, { "epoch": 0.5123468193854301, "grad_norm": 15.678554296764496, "learning_rate": 2.826315480550129e-07, "logits/chosen": -0.2549038529396057, "logits/rejected": -0.5205506682395935, "logps/chosen": -397.9513244628906, "logps/rejected": -706.6505126953125, "loss": 0.2709, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.9337005615234375, "rewards/margins": 3.1359124183654785, "rewards/rejected": -6.069613456726074, "step": 260 }, { "epoch": 0.52219964283515, "grad_norm": 13.183965337617314, "learning_rate": 2.740757312632854e-07, "logits/chosen": -0.3502804636955261, "logits/rejected": -0.47828468680381775, "logps/chosen": -458.28009033203125, "logps/rejected": -756.2743530273438, "loss": 0.3101, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.547351837158203, "rewards/margins": 3.0040035247802734, "rewards/rejected": -6.551354885101318, "step": 265 }, { "epoch": 0.5320524662848698, "grad_norm": 14.251066041226494, "learning_rate": 2.654913486571487e-07, "logits/chosen": -0.29742032289505005, "logits/rejected": -0.512765109539032, "logps/chosen": -433.20281982421875, "logps/rejected": -742.1527099609375, "loss": 0.269, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.3142356872558594, "rewards/margins": 3.1091339588165283, "rewards/rejected": -6.423369407653809, "step": 270 }, { "epoch": 0.5419052897345896, "grad_norm": 15.970456495692506, "learning_rate": 2.5688858559204053e-07, "logits/chosen": -0.29328054189682007, "logits/rejected": -0.5653128027915955, "logps/chosen": -457.0283203125, "logps/rejected": -802.5032958984375, "loss": 0.2619, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.5228774547576904, "rewards/margins": 3.504716396331787, "rewards/rejected": -7.027594566345215, "step": 275 }, { "epoch": 0.5517581131843093, "grad_norm": 30.315717109626856, "learning_rate": 2.4827764923178246e-07, "logits/chosen": -0.2728544771671295, "logits/rejected": -0.5340480208396912, "logps/chosen": -508.76715087890625, "logps/rejected": -814.9278564453125, "loss": 0.3282, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.023861408233643, "rewards/margins": 3.080327272415161, "rewards/rejected": -7.104189395904541, "step": 280 }, { "epoch": 0.5616109366340292, "grad_norm": 12.181715177068432, "learning_rate": 2.3966875643779667e-07, "logits/chosen": -0.39192137122154236, "logits/rejected": -0.48601895570755005, "logps/chosen": -509.3650817871094, "logps/rejected": -919.6329956054688, "loss": 0.2728, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.043977737426758, "rewards/margins": 4.052145957946777, "rewards/rejected": -8.096123695373535, "step": 285 }, { "epoch": 0.571463760083749, "grad_norm": 22.18619533848792, "learning_rate": 2.3107212164681774e-07, "logits/chosen": -0.2627003788948059, "logits/rejected": -0.512112557888031, "logps/chosen": -514.5509033203125, "logps/rejected": -777.1829833984375, "loss": 0.286, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.147472381591797, "rewards/margins": 2.6902387142181396, "rewards/rejected": -6.837711334228516, "step": 290 }, { "epoch": 0.5813165835334688, "grad_norm": 20.397352654188396, "learning_rate": 2.2249794475148019e-07, "logits/chosen": -0.2886350750923157, "logits/rejected": -0.533146858215332, "logps/chosen": -465.264892578125, "logps/rejected": -807.4385986328125, "loss": 0.2676, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.6474337577819824, "rewards/margins": 3.4217605590820312, "rewards/rejected": -7.0691938400268555, "step": 295 }, { "epoch": 0.5911694069831886, "grad_norm": 12.525676548560202, "learning_rate": 2.1395639899816332e-07, "logits/chosen": -0.35506710410118103, "logits/rejected": -0.4460170865058899, "logps/chosen": -424.7347106933594, "logps/rejected": -713.3164672851562, "loss": 0.2296, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.2564120292663574, "rewards/margins": 2.842578411102295, "rewards/rejected": -6.098990440368652, "step": 300 }, { "epoch": 0.6010222304329085, "grad_norm": 16.20564210489265, "learning_rate": 2.0545761891645177e-07, "logits/chosen": -0.2978189289569855, "logits/rejected": -0.5122548937797546, "logps/chosen": -471.35693359375, "logps/rejected": -824.7840576171875, "loss": 0.2482, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.6659018993377686, "rewards/margins": 3.483975887298584, "rewards/rejected": -7.14987850189209, "step": 305 }, { "epoch": 0.6108750538826282, "grad_norm": 16.59602026766405, "learning_rate": 1.9701168829453305e-07, "logits/chosen": -0.2841472625732422, "logits/rejected": -0.49612703919410706, "logps/chosen": -475.79559326171875, "logps/rejected": -832.6185302734375, "loss": 0.2832, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.748015880584717, "rewards/margins": 3.5668914318084717, "rewards/rejected": -7.314908027648926, "step": 310 }, { "epoch": 0.620727877332348, "grad_norm": 14.834560333592078, "learning_rate": 1.886286282148002e-07, "logits/chosen": -0.2684577405452728, "logits/rejected": -0.5325924158096313, "logps/chosen": -491.4776306152344, "logps/rejected": -837.3449096679688, "loss": 0.2599, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.8538050651550293, "rewards/margins": 3.5408568382263184, "rewards/rejected": -7.394662380218506, "step": 315 }, { "epoch": 0.6305807007820678, "grad_norm": 11.160506827485138, "learning_rate": 1.8031838516385422e-07, "logits/chosen": -0.2939426898956299, "logits/rejected": -0.4827597141265869, "logps/chosen": -524.61865234375, "logps/rejected": -873.7683715820312, "loss": 0.294, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.23654842376709, "rewards/margins": 3.4801859855651855, "rewards/rejected": -7.716734409332275, "step": 320 }, { "epoch": 0.6404335242317877, "grad_norm": 25.16794488629472, "learning_rate": 1.7209081923101472e-07, "logits/chosen": -0.37951114773750305, "logits/rejected": -0.5442458391189575, "logps/chosen": -484.8412170410156, "logps/rejected": -843.31494140625, "loss": 0.2438, "rewards/accuracies": 0.84375, "rewards/chosen": -3.8335394859313965, "rewards/margins": 3.537074565887451, "rewards/rejected": -7.370614528656006, "step": 325 }, { "epoch": 0.6502863476815075, "grad_norm": 15.675332902369199, "learning_rate": 1.639556924093404e-07, "logits/chosen": -0.31534910202026367, "logits/rejected": -0.6055206656455994, "logps/chosen": -469.186767578125, "logps/rejected": -785.3204345703125, "loss": 0.2862, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.6316142082214355, "rewards/margins": 3.245419979095459, "rewards/rejected": -6.8770341873168945, "step": 330 }, { "epoch": 0.6601391711312273, "grad_norm": 12.550466940425805, "learning_rate": 1.5592265701304114e-07, "logits/chosen": -0.309912770986557, "logits/rejected": -0.580001950263977, "logps/chosen": -455.7469177246094, "logps/rejected": -795.7465209960938, "loss": 0.2543, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.550823211669922, "rewards/margins": 3.412391185760498, "rewards/rejected": -6.9632134437561035, "step": 335 }, { "epoch": 0.669991994580947, "grad_norm": 14.379670239758621, "learning_rate": 1.4800124422502334e-07, "logits/chosen": -0.3434782028198242, "logits/rejected": -0.5037192106246948, "logps/chosen": -427.98486328125, "logps/rejected": -760.0518798828125, "loss": 0.2582, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.309481143951416, "rewards/margins": 3.3087706565856934, "rewards/rejected": -6.618251800537109, "step": 340 }, { "epoch": 0.6798448180306669, "grad_norm": 16.948885121756778, "learning_rate": 1.4020085278815743e-07, "logits/chosen": -0.3305511176586151, "logits/rejected": -0.510036289691925, "logps/chosen": -473.0335998535156, "logps/rejected": -838.19970703125, "loss": 0.2595, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.7052788734436035, "rewards/margins": 3.6906940937042236, "rewards/rejected": -7.39597225189209, "step": 345 }, { "epoch": 0.6896976414803867, "grad_norm": 13.672982554566653, "learning_rate": 1.3253073785368545e-07, "logits/chosen": -0.3359353244304657, "logits/rejected": -0.599565863609314, "logps/chosen": -450.7706604003906, "logps/rejected": -766.9132080078125, "loss": 0.2378, "rewards/accuracies": 0.875, "rewards/chosen": -3.4409427642822266, "rewards/margins": 3.247152328491211, "rewards/rejected": -6.6880950927734375, "step": 350 }, { "epoch": 0.6995504649301065, "grad_norm": 17.198800285063513, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.3707168996334076, "logits/rejected": -0.5359587669372559, "logps/chosen": -492.1787109375, "logps/rejected": -895.4451904296875, "loss": 0.266, "rewards/accuracies": 0.875, "rewards/chosen": -3.8479511737823486, "rewards/margins": 4.02894926071167, "rewards/rejected": -7.876900672912598, "step": 355 }, { "epoch": 0.7094032883798264, "grad_norm": 19.550687307980805, "learning_rate": 1.1761757443482285e-07, "logits/chosen": -0.2943348288536072, "logits/rejected": -0.6027869582176208, "logps/chosen": -481.8168029785156, "logps/rejected": -860.5421142578125, "loss": 0.2475, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.7917988300323486, "rewards/margins": 3.795625686645508, "rewards/rejected": -7.587424278259277, "step": 360 }, { "epoch": 0.7192561118295462, "grad_norm": 19.931316668144966, "learning_rate": 1.1039222039359644e-07, "logits/chosen": -0.32548245787620544, "logits/rejected": -0.5251120328903198, "logps/chosen": -466.42352294921875, "logps/rejected": -837.3107299804688, "loss": 0.2503, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.647482395172119, "rewards/margins": 3.7225258350372314, "rewards/rejected": -7.370007514953613, "step": 365 }, { "epoch": 0.729108935279266, "grad_norm": 27.992034585729865, "learning_rate": 1.0333251074666608e-07, "logits/chosen": -0.26019373536109924, "logits/rejected": -0.5685330629348755, "logps/chosen": -508.7705078125, "logps/rejected": -834.8313598632812, "loss": 0.3009, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.032122611999512, "rewards/margins": 3.357773542404175, "rewards/rejected": -7.389896392822266, "step": 370 }, { "epoch": 0.7389617587289857, "grad_norm": 11.391526012410443, "learning_rate": 9.644682182758304e-08, "logits/chosen": -0.36888641119003296, "logits/rejected": -0.544438362121582, "logps/chosen": -501.13922119140625, "logps/rejected": -866.8273315429688, "loss": 0.267, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.018528938293457, "rewards/margins": 3.660531997680664, "rewards/rejected": -7.679060459136963, "step": 375 }, { "epoch": 0.7488145821787056, "grad_norm": 15.381753957706216, "learning_rate": 8.974332349459992e-08, "logits/chosen": -0.3592904210090637, "logits/rejected": -0.5230213403701782, "logps/chosen": -498.76397705078125, "logps/rejected": -873.2203369140625, "loss": 0.2913, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.937748432159424, "rewards/margins": 3.7432987689971924, "rewards/rejected": -7.681046962738037, "step": 380 }, { "epoch": 0.7586674056284254, "grad_norm": 12.706615805349994, "learning_rate": 8.322996943714672e-08, "logits/chosen": -0.3497922718524933, "logits/rejected": -0.5577541589736938, "logps/chosen": -512.2010498046875, "logps/rejected": -885.4027099609375, "loss": 0.2356, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.10722017288208, "rewards/margins": 3.7454257011413574, "rewards/rejected": -7.8526458740234375, "step": 385 }, { "epoch": 0.7685202290781452, "grad_norm": 14.360275587084523, "learning_rate": 7.691448773879256e-08, "logits/chosen": -0.3780885934829712, "logits/rejected": -0.5432588458061218, "logps/chosen": -449.843505859375, "logps/rejected": -840.5940551757812, "loss": 0.2795, "rewards/accuracies": 0.875, "rewards/chosen": -3.502520799636841, "rewards/margins": 3.937730073928833, "rewards/rejected": -7.440250396728516, "step": 390 }, { "epoch": 0.778373052527865, "grad_norm": 15.407447239164402, "learning_rate": 7.080437170788722e-08, "logits/chosen": -0.38400599360466003, "logits/rejected": -0.5888391733169556, "logps/chosen": -466.9847717285156, "logps/rejected": -768.0460205078125, "loss": 0.2824, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.600980281829834, "rewards/margins": 3.0612690448760986, "rewards/rejected": -6.6622490882873535, "step": 395 }, { "epoch": 0.7882258759775849, "grad_norm": 11.960557567104859, "learning_rate": 6.490687098676332e-08, "logits/chosen": -0.4111432433128357, "logits/rejected": -0.6113660335540771, "logps/chosen": -448.69500732421875, "logps/rejected": -851.4993896484375, "loss": 0.2753, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.424553394317627, "rewards/margins": 4.026993751525879, "rewards/rejected": -7.451546669006348, "step": 400 }, { "epoch": 0.7882258759775849, "eval_logits/chosen": -1.0704303979873657, "eval_logits/rejected": -0.814034640789032, "eval_logps/chosen": -502.4184875488281, "eval_logps/rejected": -705.5203857421875, "eval_loss": 0.7013445496559143, "eval_rewards/accuracies": 0.7059999704360962, "eval_rewards/chosen": -4.094460487365723, "eval_rewards/margins": 1.768728494644165, "eval_rewards/rejected": -5.863188743591309, "eval_runtime": 197.0588, "eval_samples_per_second": 10.144, "eval_steps_per_second": 1.269, "step": 400 }, { "epoch": 0.7980786994273046, "grad_norm": 16.569681590429596, "learning_rate": 5.9228982950048414e-08, "logits/chosen": -0.3508976995944977, "logits/rejected": -0.6730154752731323, "logps/chosen": -446.61688232421875, "logps/rejected": -841.1275634765625, "loss": 0.2751, "rewards/accuracies": 0.90625, "rewards/chosen": -3.3948426246643066, "rewards/margins": 4.001449108123779, "rewards/rejected": -7.396292209625244, "step": 405 }, { "epoch": 0.8079315228770244, "grad_norm": 30.74667877593499, "learning_rate": 5.3777444402291345e-08, "logits/chosen": -0.44255560636520386, "logits/rejected": -0.5289443135261536, "logps/chosen": -484.78363037109375, "logps/rejected": -847.515625, "loss": 0.2925, "rewards/accuracies": 0.8125, "rewards/chosen": -3.900998592376709, "rewards/margins": 3.6134960651397705, "rewards/rejected": -7.5144944190979, "step": 410 }, { "epoch": 0.8177843463267442, "grad_norm": 23.687222476192307, "learning_rate": 4.855872358475546e-08, "logits/chosen": -0.3782200217247009, "logits/rejected": -0.6156097650527954, "logps/chosen": -456.51220703125, "logps/rejected": -778.0684814453125, "loss": 0.266, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.4957103729248047, "rewards/margins": 3.254422664642334, "rewards/rejected": -6.750133514404297, "step": 415 }, { "epoch": 0.8276371697764641, "grad_norm": 19.649935116061837, "learning_rate": 4.357901250086107e-08, "logits/chosen": -0.42209166288375854, "logits/rejected": -0.5967798233032227, "logps/chosen": -440.5074157714844, "logps/rejected": -706.137451171875, "loss": 0.2765, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.3461577892303467, "rewards/margins": 2.7582011222839355, "rewards/rejected": -6.104359149932861, "step": 420 }, { "epoch": 0.8374899932261839, "grad_norm": 20.031137924167396, "learning_rate": 3.884421956938377e-08, "logits/chosen": -0.3795103430747986, "logits/rejected": -0.5693720579147339, "logps/chosen": -415.31976318359375, "logps/rejected": -812.3355712890625, "loss": 0.2851, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.1916797161102295, "rewards/margins": 3.95634126663208, "rewards/rejected": -7.1480207443237305, "step": 425 }, { "epoch": 0.8473428166759037, "grad_norm": 13.471904858040782, "learning_rate": 3.435996261412591e-08, "logits/chosen": -0.37054741382598877, "logits/rejected": -0.5658844113349915, "logps/chosen": -443.5189514160156, "logps/rejected": -789.6598510742188, "loss": 0.2174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.431079864501953, "rewards/margins": 3.494525909423828, "rewards/rejected": -6.925606727600098, "step": 430 }, { "epoch": 0.8571956401256235, "grad_norm": 41.43328285681965, "learning_rate": 3.013156219837776e-08, "logits/chosen": -0.42507949471473694, "logits/rejected": -0.6270584464073181, "logps/chosen": -437.37841796875, "logps/rejected": -766.0833129882812, "loss": 0.2904, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.347297191619873, "rewards/margins": 3.312955856323242, "rewards/rejected": -6.660252571105957, "step": 435 }, { "epoch": 0.8670484635753433, "grad_norm": 14.214226148262764, "learning_rate": 2.6164035312078447e-08, "logits/chosen": -0.35758644342422485, "logits/rejected": -0.5952532291412354, "logps/chosen": -442.4962463378906, "logps/rejected": -813.2142944335938, "loss": 0.2391, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.3745460510253906, "rewards/margins": 3.7038657665252686, "rewards/rejected": -7.0784125328063965, "step": 440 }, { "epoch": 0.8769012870250631, "grad_norm": 12.722193098483684, "learning_rate": 2.2462089419165776e-08, "logits/chosen": -0.3699408173561096, "logits/rejected": -0.6010391116142273, "logps/chosen": -450.82745361328125, "logps/rejected": -841.6207885742188, "loss": 0.2671, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.4586853981018066, "rewards/margins": 3.931617259979248, "rewards/rejected": -7.3903021812438965, "step": 445 }, { "epoch": 0.8867541104747829, "grad_norm": 34.89251514032782, "learning_rate": 1.9030116872178314e-08, "logits/chosen": -0.42120829224586487, "logits/rejected": -0.5930547118186951, "logps/chosen": -482.32049560546875, "logps/rejected": -877.3797607421875, "loss": 0.2528, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.762233018875122, "rewards/margins": 3.9991488456726074, "rewards/rejected": -7.761382102966309, "step": 450 }, { "epoch": 0.8966069339245027, "grad_norm": 11.63474736568058, "learning_rate": 1.5872189700736337e-08, "logits/chosen": -0.3359231948852539, "logits/rejected": -0.580034613609314, "logps/chosen": -476.1517028808594, "logps/rejected": -840.4141845703125, "loss": 0.2568, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.7410130500793457, "rewards/margins": 3.678678512573242, "rewards/rejected": -7.419691562652588, "step": 455 }, { "epoch": 0.9064597573742226, "grad_norm": 17.200657346282192, "learning_rate": 1.2992054780085692e-08, "logits/chosen": -0.42435139417648315, "logits/rejected": -0.5707032084465027, "logps/chosen": -480.90673828125, "logps/rejected": -864.6921997070312, "loss": 0.2681, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.753577470779419, "rewards/margins": 3.8412506580352783, "rewards/rejected": -7.594828128814697, "step": 460 }, { "epoch": 0.9163125808239424, "grad_norm": 15.168319053661703, "learning_rate": 1.0393129385436823e-08, "logits/chosen": -0.41962409019470215, "logits/rejected": -0.5582268238067627, "logps/chosen": -480.22650146484375, "logps/rejected": -835.0930786132812, "loss": 0.2639, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.7359073162078857, "rewards/margins": 3.5815021991729736, "rewards/rejected": -7.317408561706543, "step": 465 }, { "epoch": 0.9261654042736621, "grad_norm": 14.145988270109239, "learning_rate": 8.078497137373242e-09, "logits/chosen": -0.3046739399433136, "logits/rejected": -0.6777099967002869, "logps/chosen": -452.22406005859375, "logps/rejected": -792.261474609375, "loss": 0.2466, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.451488494873047, "rewards/margins": 3.4896602630615234, "rewards/rejected": -6.941148281097412, "step": 470 }, { "epoch": 0.936018227723382, "grad_norm": 17.26395488122693, "learning_rate": 6.0509043431410945e-09, "logits/chosen": -0.47448891401290894, "logits/rejected": -0.5780371427536011, "logps/chosen": -473.33148193359375, "logps/rejected": -845.0810546875, "loss": 0.2735, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.652672290802002, "rewards/margins": 3.7301712036132812, "rewards/rejected": -7.382843971252441, "step": 475 }, { "epoch": 0.9458710511731018, "grad_norm": 13.502447902766075, "learning_rate": 4.312756738160145e-09, "logits/chosen": -0.4307557940483093, "logits/rejected": -0.609841525554657, "logps/chosen": -455.236572265625, "logps/rejected": -856.6760864257812, "loss": 0.2483, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.540134906768799, "rewards/margins": 3.9815449714660645, "rewards/rejected": -7.521679878234863, "step": 480 }, { "epoch": 0.9557238746228216, "grad_norm": 14.00663655974014, "learning_rate": 2.8661166316229223e-09, "logits/chosen": -0.34750238060951233, "logits/rejected": -0.5667217969894409, "logps/chosen": -460.013671875, "logps/rejected": -754.5597534179688, "loss": 0.2729, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.5511062145233154, "rewards/margins": 2.9699506759643555, "rewards/rejected": -6.52105712890625, "step": 485 }, { "epoch": 0.9655766980725414, "grad_norm": 14.399934375372629, "learning_rate": 1.7127004595681727e-09, "logits/chosen": -0.37273770570755005, "logits/rejected": -0.6115278005599976, "logps/chosen": -470.6495056152344, "logps/rejected": -818.9450073242188, "loss": 0.2724, "rewards/accuracies": 0.90625, "rewards/chosen": -3.666602611541748, "rewards/margins": 3.5529751777648926, "rewards/rejected": -7.219576835632324, "step": 490 }, { "epoch": 0.9754295215222613, "grad_norm": 13.760376876126982, "learning_rate": 8.538767483325383e-10, "logits/chosen": -0.3895708918571472, "logits/rejected": -0.6080259084701538, "logps/chosen": -463.2498474121094, "logps/rejected": -805.6424560546875, "loss": 0.2489, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.570629119873047, "rewards/margins": 3.4926178455352783, "rewards/rejected": -7.063246726989746, "step": 495 }, { "epoch": 0.9852823449719811, "grad_norm": 23.653347787632313, "learning_rate": 2.9066449079634404e-10, "logits/chosen": -0.37257179617881775, "logits/rejected": -0.5408270955085754, "logps/chosen": -501.0938415527344, "logps/rejected": -795.1988525390625, "loss": 0.2689, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.9710209369659424, "rewards/margins": 2.9601473808288574, "rewards/rejected": -6.931168556213379, "step": 500 }, { "epoch": 0.9951351684217008, "grad_norm": 14.616427588956885, "learning_rate": 2.3731937350224273e-11, "logits/chosen": -0.4466114044189453, "logits/rejected": -0.6212998032569885, "logps/chosen": -427.31536865234375, "logps/rejected": -899.2130126953125, "loss": 0.268, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.2273106575012207, "rewards/margins": 4.719491481781006, "rewards/rejected": -7.946801662445068, "step": 505 }, { "epoch": 0.9990762978015888, "step": 507, "total_flos": 0.0, "train_loss": 0.3691282767280789, "train_runtime": 28319.5283, "train_samples_per_second": 2.294, "train_steps_per_second": 0.018 } ], "logging_steps": 5, "max_steps": 507, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }