{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 6.162224436844012, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.07354654371738434, "logits/rejected": 0.1361573040485382, "logps/chosen": -1.7158677577972412, "logps/rejected": -1.8894357681274414, "loss": 1.1358, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.7158677577972412, "rewards/margins": 0.17356786131858826, "rewards/rejected": -1.8894357681274414, "sft_loss": 1.4684427976608276, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 10.941948565599809, "learning_rate": 1.7825311942959e-08, "logits/chosen": -0.0025766133330762386, "logits/rejected": 0.11985313892364502, "logps/chosen": -1.8013126850128174, "logps/rejected": -1.8446658849716187, "loss": 1.2299, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8013126850128174, "rewards/margins": 0.04335314407944679, "rewards/rejected": -1.8446658849716187, "sft_loss": 1.5082662105560303, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 13.573973853756051, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.045625053346157074, "logits/rejected": 0.053189463913440704, "logps/chosen": -1.6336523294448853, "logps/rejected": -1.7631381750106812, "loss": 1.2018, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6336523294448853, "rewards/margins": 0.1294858753681183, "rewards/rejected": -1.7631381750106812, "sft_loss": 1.4996378421783447, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 5.965322949752156, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.03252996876835823, "logits/rejected": 0.056474365293979645, "logps/chosen": -1.7247947454452515, "logps/rejected": -1.8047034740447998, "loss": 1.2262, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7247947454452515, "rewards/margins": 0.0799088105559349, "rewards/rejected": -1.8047034740447998, "sft_loss": 1.5000253915786743, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 17.583923004034727, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.06310281157493591, "logits/rejected": 0.024211319163441658, "logps/chosen": -1.8702064752578735, "logps/rejected": -1.7776330709457397, "loss": 1.3473, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -1.8702064752578735, "rewards/margins": -0.09257296472787857, "rewards/rejected": -1.7776330709457397, "sft_loss": 1.5455201864242554, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 12.928955930161317, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.09293361008167267, "logits/rejected": 0.002235558582469821, "logps/chosen": -1.9093812704086304, "logps/rejected": -1.8332099914550781, "loss": 1.3291, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.9093812704086304, "rewards/margins": -0.07617148756980896, "rewards/rejected": -1.8332099914550781, "sft_loss": 1.6468474864959717, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 11.844224730796958, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.04166014865040779, "logits/rejected": 0.12396843731403351, "logps/chosen": -1.8452978134155273, "logps/rejected": -1.9958372116088867, "loss": 1.2679, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8452978134155273, "rewards/margins": 0.15053938329219818, "rewards/rejected": -1.9958372116088867, "sft_loss": 1.5612391233444214, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 11.104553717008118, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.04377365857362747, "logits/rejected": 0.22366110980510712, "logps/chosen": -1.880755066871643, "logps/rejected": -1.7429109811782837, "loss": 1.3102, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.880755066871643, "rewards/margins": -0.13784421980381012, "rewards/rejected": -1.7429109811782837, "sft_loss": 1.519200325012207, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 16.41709353765331, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.012500310316681862, "logits/rejected": 0.21139463782310486, "logps/chosen": -1.8342450857162476, "logps/rejected": -1.8698947429656982, "loss": 1.2791, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.8342450857162476, "rewards/margins": 0.03564963862299919, "rewards/rejected": -1.8698947429656982, "sft_loss": 1.5351488590240479, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 13.19811950294341, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.06285648047924042, "logits/rejected": 0.09227000921964645, "logps/chosen": -1.8954331874847412, "logps/rejected": -1.7754628658294678, "loss": 1.3432, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.8954331874847412, "rewards/margins": -0.11997010558843613, "rewards/rejected": -1.7754628658294678, "sft_loss": 1.5819414854049683, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 9.01823552109749, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.10592956840991974, "logits/rejected": 0.1197512298822403, "logps/chosen": -1.8273578882217407, "logps/rejected": -1.8612210750579834, "loss": 1.3116, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8273578882217407, "rewards/margins": 0.03386329859495163, "rewards/rejected": -1.8612210750579834, "sft_loss": 1.5808497667312622, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 8.760806112736027, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.07141174376010895, "logits/rejected": 0.12480834871530533, "logps/chosen": -1.7802963256835938, "logps/rejected": -1.8852354288101196, "loss": 1.2193, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7802963256835938, "rewards/margins": 0.1049388200044632, "rewards/rejected": -1.8852354288101196, "sft_loss": 1.5415582656860352, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 7.306903729984867, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.03792408108711243, "logits/rejected": 0.11073604971170425, "logps/chosen": -1.631109595298767, "logps/rejected": -1.7599260807037354, "loss": 1.1604, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.631109595298767, "rewards/margins": 0.12881648540496826, "rewards/rejected": -1.7599260807037354, "sft_loss": 1.4718269109725952, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 14.757953721252662, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.06770111620426178, "logits/rejected": 0.08704431354999542, "logps/chosen": -1.759752869606018, "logps/rejected": -1.8056182861328125, "loss": 1.3103, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.759752869606018, "rewards/margins": 0.045865464955568314, "rewards/rejected": -1.8056182861328125, "sft_loss": 1.6259733438491821, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 14.591213515171626, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.03633163869380951, "logits/rejected": 0.15258662402629852, "logps/chosen": -1.7644485235214233, "logps/rejected": -2.022709369659424, "loss": 1.1785, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7644485235214233, "rewards/margins": 0.258261114358902, "rewards/rejected": -2.022709369659424, "sft_loss": 1.5605233907699585, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 9.952040314815008, "learning_rate": 1.42602495543672e-07, "logits/chosen": 0.004873444326221943, "logits/rejected": 0.1104719489812851, "logps/chosen": -1.6954625844955444, "logps/rejected": -1.7293781042099, "loss": 1.2311, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6954625844955444, "rewards/margins": 0.03391539677977562, "rewards/rejected": -1.7293781042099, "sft_loss": 1.5150405168533325, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 5.982856438454467, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.15907660126686096, "logits/rejected": 0.0876961499452591, "logps/chosen": -1.761853814125061, "logps/rejected": -1.9326856136322021, "loss": 1.1865, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.761853814125061, "rewards/margins": 0.17083203792572021, "rewards/rejected": -1.9326856136322021, "sft_loss": 1.4846560955047607, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 16.704401798380253, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.08973591774702072, "logits/rejected": 0.05205491930246353, "logps/chosen": -1.712908387184143, "logps/rejected": -1.7490822076797485, "loss": 1.2372, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.712908387184143, "rewards/margins": 0.03617396205663681, "rewards/rejected": -1.7490822076797485, "sft_loss": 1.4473885297775269, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 9.048528442375913, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.0784987211227417, "logits/rejected": 0.07455357909202576, "logps/chosen": -1.7556579113006592, "logps/rejected": -1.8748867511749268, "loss": 1.216, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7556579113006592, "rewards/margins": 0.11922872066497803, "rewards/rejected": -1.8748867511749268, "sft_loss": 1.5081722736358643, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 5.527964399235599, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.0294797420501709, "logits/rejected": 0.03756122291088104, "logps/chosen": -1.661158561706543, "logps/rejected": -1.766716718673706, "loss": 1.1814, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.661158561706543, "rewards/margins": 0.10555823147296906, "rewards/rejected": -1.766716718673706, "sft_loss": 1.478492259979248, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 11.601096875377186, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.046459540724754333, "logits/rejected": 0.07478093355894089, "logps/chosen": -1.6010570526123047, "logps/rejected": -1.7681080102920532, "loss": 1.139, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6010570526123047, "rewards/margins": 0.16705112159252167, "rewards/rejected": -1.7681080102920532, "sft_loss": 1.4180337190628052, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 8.400580715452332, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.004502465482801199, "logits/rejected": 0.10074075311422348, "logps/chosen": -1.5968029499053955, "logps/rejected": -1.654841661453247, "loss": 1.1857, "rewards/accuracies": 0.5, "rewards/chosen": -1.5968029499053955, "rewards/margins": 0.058038532733917236, "rewards/rejected": -1.654841661453247, "sft_loss": 1.4301693439483643, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 12.483377529373273, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.01744399033486843, "logits/rejected": 0.22708511352539062, "logps/chosen": -1.5779647827148438, "logps/rejected": -1.8342781066894531, "loss": 1.1262, "rewards/accuracies": 0.625, "rewards/chosen": -1.5779647827148438, "rewards/margins": 0.2563134729862213, "rewards/rejected": -1.8342781066894531, "sft_loss": 1.518568515777588, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 8.546077002576027, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.11730382591485977, "logits/rejected": 0.050408393144607544, "logps/chosen": -1.6292957067489624, "logps/rejected": -1.7397758960723877, "loss": 1.1787, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6292957067489624, "rewards/margins": 0.11048026382923126, "rewards/rejected": -1.7397758960723877, "sft_loss": 1.5028191804885864, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 5.241431853062498, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.11590234935283661, "logits/rejected": 0.013391993939876556, "logps/chosen": -1.569298505783081, "logps/rejected": -1.5349996089935303, "loss": 1.224, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.569298505783081, "rewards/margins": -0.03429893031716347, "rewards/rejected": -1.5349996089935303, "sft_loss": 1.4846540689468384, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 9.842421905850513, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.01973932608962059, "logits/rejected": 0.15391430258750916, "logps/chosen": -1.6046987771987915, "logps/rejected": -1.721130132675171, "loss": 1.1556, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.6046987771987915, "rewards/margins": 0.11643137037754059, "rewards/rejected": -1.721130132675171, "sft_loss": 1.540232539176941, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 17.94063478027767, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.06300957500934601, "logits/rejected": 0.05799748748540878, "logps/chosen": -1.6499484777450562, "logps/rejected": -1.6832249164581299, "loss": 1.2084, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.6499484777450562, "rewards/margins": 0.033276233822107315, "rewards/rejected": -1.6832249164581299, "sft_loss": 1.4744528532028198, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 8.654963108118846, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.0444134920835495, "logits/rejected": 0.12819430232048035, "logps/chosen": -1.613511323928833, "logps/rejected": -1.7291886806488037, "loss": 1.1738, "rewards/accuracies": 0.5, "rewards/chosen": -1.613511323928833, "rewards/margins": 0.11567743122577667, "rewards/rejected": -1.7291886806488037, "sft_loss": 1.522329330444336, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 9.908182221301256, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.029214048758149147, "logits/rejected": 0.12418278306722641, "logps/chosen": -1.5189011096954346, "logps/rejected": -1.6256484985351562, "loss": 1.1541, "rewards/accuracies": 0.46875, "rewards/chosen": -1.5189011096954346, "rewards/margins": 0.10674738883972168, "rewards/rejected": -1.6256484985351562, "sft_loss": 1.4731454849243164, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 13.151478113506652, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.07878760993480682, "logits/rejected": 0.08296145498752594, "logps/chosen": -1.4598569869995117, "logps/rejected": -1.4593470096588135, "loss": 1.1583, "rewards/accuracies": 0.5, "rewards/chosen": -1.4598569869995117, "rewards/margins": -0.0005100608104839921, "rewards/rejected": -1.4593470096588135, "sft_loss": 1.329883337020874, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 9.598508318421777, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.10372467339038849, "logits/rejected": -0.05565663054585457, "logps/chosen": -1.4395086765289307, "logps/rejected": -1.5460284948349, "loss": 1.1221, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4395086765289307, "rewards/margins": 0.10651954263448715, "rewards/rejected": -1.5460284948349, "sft_loss": 1.3945090770721436, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 10.148738813875053, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.1604335755109787, "logits/rejected": -0.01720331236720085, "logps/chosen": -1.5059787034988403, "logps/rejected": -1.4849656820297241, "loss": 1.2026, "rewards/accuracies": 0.46875, "rewards/chosen": -1.5059787034988403, "rewards/margins": -0.021012943238019943, "rewards/rejected": -1.4849656820297241, "sft_loss": 1.424424171447754, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 8.444078924667641, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.09711726009845734, "logits/rejected": 0.07096539437770844, "logps/chosen": -1.3629688024520874, "logps/rejected": -1.449608564376831, "loss": 1.1205, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3629688024520874, "rewards/margins": 0.0866396352648735, "rewards/rejected": -1.449608564376831, "sft_loss": 1.3194470405578613, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 13.236412367686684, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.145632803440094, "logits/rejected": -0.09582562744617462, "logps/chosen": -1.4657261371612549, "logps/rejected": -1.5351206064224243, "loss": 1.1642, "rewards/accuracies": 0.5, "rewards/chosen": -1.4657261371612549, "rewards/margins": 0.06939435005187988, "rewards/rejected": -1.5351206064224243, "sft_loss": 1.438075304031372, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 7.787774859149804, "learning_rate": 3.1194295900178254e-07, "logits/chosen": -0.02539270557463169, "logits/rejected": -0.026614580303430557, "logps/chosen": -1.3474395275115967, "logps/rejected": -1.4411218166351318, "loss": 1.1216, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3474395275115967, "rewards/margins": 0.09368231147527695, "rewards/rejected": -1.4411218166351318, "sft_loss": 1.3676502704620361, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 6.812116118866956, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.06027358025312424, "logits/rejected": -0.05768832564353943, "logps/chosen": -1.3618038892745972, "logps/rejected": -1.560143232345581, "loss": 1.1173, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3618038892745972, "rewards/margins": 0.1983393430709839, "rewards/rejected": -1.560143232345581, "sft_loss": 1.3774890899658203, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 7.994529020372144, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.20169305801391602, "logits/rejected": -0.11706896126270294, "logps/chosen": -1.3508808612823486, "logps/rejected": -1.39878249168396, "loss": 1.1426, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3508808612823486, "rewards/margins": 0.04790160059928894, "rewards/rejected": -1.39878249168396, "sft_loss": 1.3566535711288452, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 7.132584817326348, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.1200329065322876, "logits/rejected": -0.011905002407729626, "logps/chosen": -1.2903319597244263, "logps/rejected": -1.417377233505249, "loss": 1.0801, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2903319597244263, "rewards/margins": 0.12704533338546753, "rewards/rejected": -1.417377233505249, "sft_loss": 1.3000907897949219, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 5.467117063832052, "learning_rate": 3.475935828877005e-07, "logits/chosen": -0.04079444706439972, "logits/rejected": 0.10692572593688965, "logps/chosen": -1.2521560192108154, "logps/rejected": -1.4163955450057983, "loss": 1.0558, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2521560192108154, "rewards/margins": 0.16423942148685455, "rewards/rejected": -1.4163955450057983, "sft_loss": 1.2910324335098267, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 16.412975138686853, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.1568303108215332, "logits/rejected": -0.029046082869172096, "logps/chosen": -1.3794472217559814, "logps/rejected": -1.4154322147369385, "loss": 1.1498, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3794472217559814, "rewards/margins": 0.03598495572805405, "rewards/rejected": -1.4154322147369385, "sft_loss": 1.3919644355773926, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 11.53610009270648, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.094792440533638, "logits/rejected": 0.04317759722471237, "logps/chosen": -1.2949237823486328, "logps/rejected": -1.359531044960022, "loss": 1.1074, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2949237823486328, "rewards/margins": 0.0646071583032608, "rewards/rejected": -1.359531044960022, "sft_loss": 1.2842977046966553, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 7.887283586640461, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.19363883137702942, "logits/rejected": -0.016594117507338524, "logps/chosen": -1.3659045696258545, "logps/rejected": -1.4803597927093506, "loss": 1.1136, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3659045696258545, "rewards/margins": 0.11445526778697968, "rewards/rejected": -1.4803597927093506, "sft_loss": 1.338024377822876, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 6.085333351790094, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.2194538116455078, "logits/rejected": 0.015209652483463287, "logps/chosen": -1.3826278448104858, "logps/rejected": -1.4374979734420776, "loss": 1.1212, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3826278448104858, "rewards/margins": 0.054870136082172394, "rewards/rejected": -1.4374979734420776, "sft_loss": 1.3614327907562256, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 12.841488245600196, "learning_rate": 3.92156862745098e-07, "logits/chosen": -0.021501736715435982, "logits/rejected": 0.066695936024189, "logps/chosen": -1.3111062049865723, "logps/rejected": -1.4572151899337769, "loss": 1.0912, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3111062049865723, "rewards/margins": 0.14610889554023743, "rewards/rejected": -1.4572151899337769, "sft_loss": 1.3320391178131104, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 5.5322486055819775, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.13724544644355774, "logits/rejected": 0.023908359929919243, "logps/chosen": -1.3111350536346436, "logps/rejected": -1.4400126934051514, "loss": 1.0738, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3111350536346436, "rewards/margins": 0.1288774460554123, "rewards/rejected": -1.4400126934051514, "sft_loss": 1.310573935508728, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 5.527098274388295, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.03876190260052681, "logits/rejected": 0.03524729236960411, "logps/chosen": -1.3190863132476807, "logps/rejected": -1.4766333103179932, "loss": 1.0667, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3190863132476807, "rewards/margins": 0.15754687786102295, "rewards/rejected": -1.4766333103179932, "sft_loss": 1.2724026441574097, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 7.683355642835208, "learning_rate": 4.188948306595365e-07, "logits/chosen": -0.027026275172829628, "logits/rejected": 0.10092300176620483, "logps/chosen": -1.2846142053604126, "logps/rejected": -1.4518486261367798, "loss": 1.0556, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2846142053604126, "rewards/margins": 0.16723443567752838, "rewards/rejected": -1.4518486261367798, "sft_loss": 1.2829954624176025, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 4.770573773954613, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.06706438958644867, "logits/rejected": 0.053351886570453644, "logps/chosen": -1.3005207777023315, "logps/rejected": -1.4855378866195679, "loss": 1.0783, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3005207777023315, "rewards/margins": 0.18501710891723633, "rewards/rejected": -1.4855378866195679, "sft_loss": 1.3445043563842773, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 6.871730260983015, "learning_rate": 4.3672014260249554e-07, "logits/chosen": -0.004986020736396313, "logits/rejected": 0.10724584758281708, "logps/chosen": -1.418555498123169, "logps/rejected": -1.4448583126068115, "loss": 1.1693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.418555498123169, "rewards/margins": 0.026302779093384743, "rewards/rejected": -1.4448583126068115, "sft_loss": 1.4301879405975342, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 10.203244665026718, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.10470624268054962, "logits/rejected": 0.051918040961027145, "logps/chosen": -1.3056375980377197, "logps/rejected": -1.3619807958602905, "loss": 1.1313, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3056375980377197, "rewards/margins": 0.05634317919611931, "rewards/rejected": -1.3619807958602905, "sft_loss": 1.3133130073547363, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 6.790912025671572, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.055276162922382355, "logits/rejected": 0.0787787213921547, "logps/chosen": -1.2640888690948486, "logps/rejected": -1.3716880083084106, "loss": 1.0607, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2640888690948486, "rewards/margins": 0.1075989231467247, "rewards/rejected": -1.3716880083084106, "sft_loss": 1.2419153451919556, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 5.636934333298813, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.2526181638240814, "logits/rejected": -0.15046949684619904, "logps/chosen": -1.3521802425384521, "logps/rejected": -1.507084846496582, "loss": 1.0814, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3521802425384521, "rewards/margins": 0.1549045741558075, "rewards/rejected": -1.507084846496582, "sft_loss": 1.3677871227264404, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 7.491534287261313, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.13241180777549744, "logits/rejected": -0.05078262835741043, "logps/chosen": -1.3444370031356812, "logps/rejected": -1.508912444114685, "loss": 1.1041, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3444370031356812, "rewards/margins": 0.16447539627552032, "rewards/rejected": -1.508912444114685, "sft_loss": 1.3960318565368652, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 4.784128370751162, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.12042725086212158, "logits/rejected": 0.003054526401683688, "logps/chosen": -1.316843032836914, "logps/rejected": -1.417441964149475, "loss": 1.0933, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.316843032836914, "rewards/margins": 0.10059895366430283, "rewards/rejected": -1.417441964149475, "sft_loss": 1.3325092792510986, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 5.799474251053073, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.07320351898670197, "logits/rejected": 0.021167168393731117, "logps/chosen": -1.2687400579452515, "logps/rejected": -1.4257270097732544, "loss": 1.0499, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2687400579452515, "rewards/margins": 0.15698681771755219, "rewards/rejected": -1.4257270097732544, "sft_loss": 1.2448979616165161, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 7.0387417725027275, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.13529178500175476, "logits/rejected": 0.012803696095943451, "logps/chosen": -1.3235130310058594, "logps/rejected": -1.4172152280807495, "loss": 1.0932, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3235130310058594, "rewards/margins": 0.09370215237140656, "rewards/rejected": -1.4172152280807495, "sft_loss": 1.3104783296585083, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 6.581168345074518, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.12943314015865326, "logits/rejected": -0.0024739429354667664, "logps/chosen": -1.345080852508545, "logps/rejected": -1.4286965131759644, "loss": 1.1339, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.345080852508545, "rewards/margins": 0.0836157277226448, "rewards/rejected": -1.4286965131759644, "sft_loss": 1.3899028301239014, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 6.873481271303318, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.1648433953523636, "logits/rejected": 0.11345580965280533, "logps/chosen": -1.3644860982894897, "logps/rejected": -1.484805703163147, "loss": 1.0947, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3644860982894897, "rewards/margins": 0.1203194409608841, "rewards/rejected": -1.484805703163147, "sft_loss": 1.357634425163269, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 7.067404946544144, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.10759624093770981, "logits/rejected": -0.05163930729031563, "logps/chosen": -1.26571524143219, "logps/rejected": -1.4039279222488403, "loss": 1.0593, "rewards/accuracies": 0.5625, "rewards/chosen": -1.26571524143219, "rewards/margins": 0.13821277022361755, "rewards/rejected": -1.4039279222488403, "sft_loss": 1.271865963935852, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 6.4408716649587525, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.113519586622715, "logits/rejected": 0.045286424458026886, "logps/chosen": -1.308065414428711, "logps/rejected": -1.3825656175613403, "loss": 1.1161, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.308065414428711, "rewards/margins": 0.0745001882314682, "rewards/rejected": -1.3825656175613403, "sft_loss": 1.3652435541152954, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 5.194545703203444, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.07074855268001556, "logits/rejected": -0.0032803595531731844, "logps/chosen": -1.4063204526901245, "logps/rejected": -1.417278528213501, "loss": 1.1752, "rewards/accuracies": 0.5, "rewards/chosen": -1.4063204526901245, "rewards/margins": 0.010958048515021801, "rewards/rejected": -1.417278528213501, "sft_loss": 1.4083219766616821, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 7.158406715301559, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.2518317699432373, "logits/rejected": -0.16715845465660095, "logps/chosen": -1.3734468221664429, "logps/rejected": -1.459455132484436, "loss": 1.1388, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3734468221664429, "rewards/margins": 0.08600833266973495, "rewards/rejected": -1.459455132484436, "sft_loss": 1.3688867092132568, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 7.47312214042103, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.04104981943964958, "logits/rejected": 0.11383312940597534, "logps/chosen": -1.3609014749526978, "logps/rejected": -1.509178876876831, "loss": 1.1223, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3609014749526978, "rewards/margins": 0.1482773870229721, "rewards/rejected": -1.509178876876831, "sft_loss": 1.3762632608413696, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 5.142394838688744, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.09145097434520721, "logits/rejected": 0.036288149654865265, "logps/chosen": -1.3178232908248901, "logps/rejected": -1.370639443397522, "loss": 1.1185, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3178232908248901, "rewards/margins": 0.05281621962785721, "rewards/rejected": -1.370639443397522, "sft_loss": 1.3386495113372803, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 6.1215749137539035, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.16615521907806396, "logits/rejected": -0.05705835670232773, "logps/chosen": -1.320780634880066, "logps/rejected": -1.5688358545303345, "loss": 1.0807, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.320780634880066, "rewards/margins": 0.24805521965026855, "rewards/rejected": -1.5688358545303345, "sft_loss": 1.4028794765472412, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 9.554114693660987, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.06171814724802971, "logits/rejected": 0.0778273418545723, "logps/chosen": -1.3327248096466064, "logps/rejected": -1.511650800704956, "loss": 1.081, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3327248096466064, "rewards/margins": 0.17892588675022125, "rewards/rejected": -1.511650800704956, "sft_loss": 1.3499819040298462, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 8.691211468546095, "learning_rate": 5.971479500891266e-07, "logits/chosen": -0.010685861110687256, "logits/rejected": 0.0859212800860405, "logps/chosen": -1.3480260372161865, "logps/rejected": -1.3847862482070923, "loss": 1.1218, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3480260372161865, "rewards/margins": 0.036760084331035614, "rewards/rejected": -1.3847862482070923, "sft_loss": 1.3458187580108643, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 8.682402624050773, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.08280332386493683, "logits/rejected": 0.05182039737701416, "logps/chosen": -1.4081138372421265, "logps/rejected": -1.4955675601959229, "loss": 1.1435, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4081138372421265, "rewards/margins": 0.08745387196540833, "rewards/rejected": -1.4955675601959229, "sft_loss": 1.395704746246338, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 10.09059167148764, "learning_rate": 6.149732620320855e-07, "logits/chosen": -0.0009210974094457924, "logits/rejected": 0.026766661554574966, "logps/chosen": -1.3219306468963623, "logps/rejected": -1.4634308815002441, "loss": 1.0916, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3219306468963623, "rewards/margins": 0.14150023460388184, "rewards/rejected": -1.4634308815002441, "sft_loss": 1.3607900142669678, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 7.445334085344142, "learning_rate": 6.238859180035651e-07, "logits/chosen": -0.010182015597820282, "logits/rejected": 0.08374631404876709, "logps/chosen": -1.2952816486358643, "logps/rejected": -1.414016604423523, "loss": 1.1085, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2952816486358643, "rewards/margins": 0.11873485893011093, "rewards/rejected": -1.414016604423523, "sft_loss": 1.3489550352096558, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 6.852093118669004, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.12995010614395142, "logits/rejected": 0.08240822702646255, "logps/chosen": -1.396977186203003, "logps/rejected": -1.427973747253418, "loss": 1.1597, "rewards/accuracies": 0.46875, "rewards/chosen": -1.396977186203003, "rewards/margins": 0.03099655732512474, "rewards/rejected": -1.427973747253418, "sft_loss": 1.405177354812622, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 6.5468973260672305, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.10669644922018051, "logits/rejected": -0.03266788646578789, "logps/chosen": -1.3259212970733643, "logps/rejected": -1.4534105062484741, "loss": 1.0918, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3259212970733643, "rewards/margins": 0.12748919427394867, "rewards/rejected": -1.4534105062484741, "sft_loss": 1.3055691719055176, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 7.65512352583562, "learning_rate": 6.506238859180035e-07, "logits/chosen": -0.0375606045126915, "logits/rejected": 0.038245074450969696, "logps/chosen": -1.3043148517608643, "logps/rejected": -1.4049714803695679, "loss": 1.092, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3043148517608643, "rewards/margins": 0.10065661370754242, "rewards/rejected": -1.4049714803695679, "sft_loss": 1.2806155681610107, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 6.490861236478968, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.06274469941854477, "logits/rejected": 0.02214394509792328, "logps/chosen": -1.2985416650772095, "logps/rejected": -1.3463687896728516, "loss": 1.102, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2985416650772095, "rewards/margins": 0.0478271022439003, "rewards/rejected": -1.3463687896728516, "sft_loss": 1.2697174549102783, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 6.908038561608901, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.1052565723657608, "logits/rejected": 0.04586447775363922, "logps/chosen": -1.2791210412979126, "logps/rejected": -1.426588535308838, "loss": 1.0847, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2791210412979126, "rewards/margins": 0.1474677473306656, "rewards/rejected": -1.426588535308838, "sft_loss": 1.3325226306915283, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 6.034487961079161, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.06363788992166519, "logits/rejected": 0.01771625317633152, "logps/chosen": -1.2925516366958618, "logps/rejected": -1.4579870700836182, "loss": 1.0643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2925516366958618, "rewards/margins": 0.16543535888195038, "rewards/rejected": -1.4579870700836182, "sft_loss": 1.303420066833496, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 4.691163180630138, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.007407332770526409, "logits/rejected": 0.07189084589481354, "logps/chosen": -1.391524314880371, "logps/rejected": -1.3882184028625488, "loss": 1.1825, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.391524314880371, "rewards/margins": -0.0033058510161936283, "rewards/rejected": -1.3882184028625488, "sft_loss": 1.4002737998962402, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 8.263656354996776, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.04927052557468414, "logits/rejected": 0.21582689881324768, "logps/chosen": -1.387182593345642, "logps/rejected": -1.4510324001312256, "loss": 1.1495, "rewards/accuracies": 0.46875, "rewards/chosen": -1.387182593345642, "rewards/margins": 0.06384972482919693, "rewards/rejected": -1.4510324001312256, "sft_loss": 1.3905909061431885, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 6.478916151612064, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.07751376926898956, "logits/rejected": 0.0774260088801384, "logps/chosen": -1.335992693901062, "logps/rejected": -1.3548707962036133, "loss": 1.1256, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.335992693901062, "rewards/margins": 0.01887820102274418, "rewards/rejected": -1.3548707962036133, "sft_loss": 1.342871904373169, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 5.185324220042295, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.06050665304064751, "logits/rejected": 0.153707355260849, "logps/chosen": -1.3152250051498413, "logps/rejected": -1.4181197881698608, "loss": 1.09, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3152250051498413, "rewards/margins": 0.10289473831653595, "rewards/rejected": -1.4181197881698608, "sft_loss": 1.3137023448944092, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.23501165211200714, "eval_logits/rejected": 0.32066836953163147, "eval_logps/chosen": -1.3477087020874023, "eval_logps/rejected": -1.4854915142059326, "eval_loss": 1.1010371446609497, "eval_rewards/accuracies": 0.5586053133010864, "eval_rewards/chosen": -1.3477087020874023, "eval_rewards/margins": 0.137783020734787, "eval_rewards/rejected": -1.4854915142059326, "eval_runtime": 44.0326, "eval_samples_per_second": 30.546, "eval_sft_loss": 1.3681285381317139, "eval_steps_per_second": 7.653, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 7.899141152347657, "learning_rate": 7.219251336898395e-07, "logits/chosen": -0.014751395210623741, "logits/rejected": 0.08054462820291519, "logps/chosen": -1.3261982202529907, "logps/rejected": -1.3998126983642578, "loss": 1.1073, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3261982202529907, "rewards/margins": 0.0736144408583641, "rewards/rejected": -1.3998126983642578, "sft_loss": 1.3163540363311768, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 6.426580024778647, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.010929781012237072, "logits/rejected": 0.1424231231212616, "logps/chosen": -1.2975648641586304, "logps/rejected": -1.3927663564682007, "loss": 1.1032, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2975648641586304, "rewards/margins": 0.09520147740840912, "rewards/rejected": -1.3927663564682007, "sft_loss": 1.332929015159607, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 5.321301590410997, "learning_rate": 7.397504456327985e-07, "logits/chosen": -0.05153341218829155, "logits/rejected": -0.017596019431948662, "logps/chosen": -1.293229341506958, "logps/rejected": -1.460153341293335, "loss": 1.0731, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.293229341506958, "rewards/margins": 0.16692404448986053, "rewards/rejected": -1.460153341293335, "sft_loss": 1.3046073913574219, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 6.338567776813103, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.022293150424957275, "logits/rejected": 0.16873207688331604, "logps/chosen": -1.2768056392669678, "logps/rejected": -1.3799188137054443, "loss": 1.1008, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2768056392669678, "rewards/margins": 0.10311311483383179, "rewards/rejected": -1.3799188137054443, "sft_loss": 1.3268121480941772, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 6.0883866845079675, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.07399999350309372, "logits/rejected": 0.12590864300727844, "logps/chosen": -1.3194478750228882, "logps/rejected": -1.4870588779449463, "loss": 1.0876, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3194478750228882, "rewards/margins": 0.1676110327243805, "rewards/rejected": -1.4870588779449463, "sft_loss": 1.3896230459213257, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 6.718130456854809, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.08521527796983719, "logits/rejected": 0.11168261617422104, "logps/chosen": -1.3500198125839233, "logps/rejected": -1.4981739521026611, "loss": 1.0938, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3500198125839233, "rewards/margins": 0.14815422892570496, "rewards/rejected": -1.4981739521026611, "sft_loss": 1.3889285326004028, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 7.101305216195323, "learning_rate": 7.754010695187165e-07, "logits/chosen": -0.017105095088481903, "logits/rejected": 0.07132132351398468, "logps/chosen": -1.2278684377670288, "logps/rejected": -1.3633949756622314, "loss": 1.067, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2278684377670288, "rewards/margins": 0.13552668690681458, "rewards/rejected": -1.3633949756622314, "sft_loss": 1.2899867296218872, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 5.380683221006391, "learning_rate": 7.84313725490196e-07, "logits/chosen": -0.015782993286848068, "logits/rejected": 0.07443811744451523, "logps/chosen": -1.2975280284881592, "logps/rejected": -1.398431658744812, "loss": 1.0852, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2975280284881592, "rewards/margins": 0.10090353339910507, "rewards/rejected": -1.398431658744812, "sft_loss": 1.3160045146942139, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 5.636935957579122, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.083624929189682, "logits/rejected": 0.023563571274280548, "logps/chosen": -1.3276898860931396, "logps/rejected": -1.4673435688018799, "loss": 1.1097, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3276898860931396, "rewards/margins": 0.13965357840061188, "rewards/rejected": -1.4673435688018799, "sft_loss": 1.3639371395111084, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 10.335663880408164, "learning_rate": 8.02139037433155e-07, "logits/chosen": -0.0073973932303488255, "logits/rejected": 0.11964131891727448, "logps/chosen": -1.3213504552841187, "logps/rejected": -1.461319088935852, "loss": 1.0537, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3213504552841187, "rewards/margins": 0.13996846973896027, "rewards/rejected": -1.461319088935852, "sft_loss": 1.3012510538101196, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 6.305064923156825, "learning_rate": 8.110516934046346e-07, "logits/chosen": -0.008607283234596252, "logits/rejected": 0.08251720666885376, "logps/chosen": -1.2554385662078857, "logps/rejected": -1.4584252834320068, "loss": 1.0472, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2554385662078857, "rewards/margins": 0.20298662781715393, "rewards/rejected": -1.4584252834320068, "sft_loss": 1.2757747173309326, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 7.5266659029364975, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.1394522488117218, "logits/rejected": -0.015918530523777008, "logps/chosen": -1.3858789205551147, "logps/rejected": -1.4430443048477173, "loss": 1.1494, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3858789205551147, "rewards/margins": 0.057165395468473434, "rewards/rejected": -1.4430443048477173, "sft_loss": 1.4158474206924438, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 6.809377978897288, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.1192874163389206, "logits/rejected": 0.14119035005569458, "logps/chosen": -1.2931272983551025, "logps/rejected": -1.4714761972427368, "loss": 1.0697, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2931272983551025, "rewards/margins": 0.17834879457950592, "rewards/rejected": -1.4714761972427368, "sft_loss": 1.297696828842163, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 6.671818465646382, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.1525087058544159, "logits/rejected": 0.1035248264670372, "logps/chosen": -1.2508265972137451, "logps/rejected": -1.4483669996261597, "loss": 1.0476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2508265972137451, "rewards/margins": 0.19754032790660858, "rewards/rejected": -1.4483669996261597, "sft_loss": 1.2858855724334717, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 6.6227441453418665, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.05956697463989258, "logits/rejected": 0.08246360719203949, "logps/chosen": -1.3117125034332275, "logps/rejected": -1.5495645999908447, "loss": 1.0497, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3117125034332275, "rewards/margins": 0.23785214126110077, "rewards/rejected": -1.5495645999908447, "sft_loss": 1.341576337814331, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 6.317720549483203, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.044476814568042755, "logits/rejected": 0.15844932198524475, "logps/chosen": -1.2904717922210693, "logps/rejected": -1.3672595024108887, "loss": 1.0942, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2904717922210693, "rewards/margins": 0.07678767293691635, "rewards/rejected": -1.3672595024108887, "sft_loss": 1.3166091442108154, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 8.218639401359006, "learning_rate": 8.645276292335115e-07, "logits/chosen": -0.012549695558845997, "logits/rejected": 0.025943463668227196, "logps/chosen": -1.3858129978179932, "logps/rejected": -1.479968786239624, "loss": 1.123, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3858129978179932, "rewards/margins": 0.09415578842163086, "rewards/rejected": -1.479968786239624, "sft_loss": 1.3809711933135986, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 5.9336956749373675, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.02575538493692875, "logits/rejected": 0.0991683155298233, "logps/chosen": -1.3275907039642334, "logps/rejected": -1.414987325668335, "loss": 1.1151, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3275907039642334, "rewards/margins": 0.08739662170410156, "rewards/rejected": -1.414987325668335, "sft_loss": 1.3196409940719604, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 7.283706408339409, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.060841191560029984, "logits/rejected": -0.03938784822821617, "logps/chosen": -1.3379141092300415, "logps/rejected": -1.456209421157837, "loss": 1.1165, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3379141092300415, "rewards/margins": 0.11829522997140884, "rewards/rejected": -1.456209421157837, "sft_loss": 1.3973592519760132, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 6.058178591419342, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.053350646048784256, "logits/rejected": 0.0480460450053215, "logps/chosen": -1.2455228567123413, "logps/rejected": -1.4103162288665771, "loss": 1.0704, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2455228567123413, "rewards/margins": 0.16479340195655823, "rewards/rejected": -1.4103162288665771, "sft_loss": 1.2850632667541504, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 8.871998038934212, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.03907988965511322, "logits/rejected": 0.11153552681207657, "logps/chosen": -1.3700611591339111, "logps/rejected": -1.4140938520431519, "loss": 1.1365, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3700611591339111, "rewards/margins": 0.044032808393239975, "rewards/rejected": -1.4140938520431519, "sft_loss": 1.3838356733322144, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 8.170312428775517, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.09565381705760956, "logits/rejected": 0.15702112019062042, "logps/chosen": -1.3171789646148682, "logps/rejected": -1.503444790840149, "loss": 1.0555, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3171789646148682, "rewards/margins": 0.18626593053340912, "rewards/rejected": -1.503444790840149, "sft_loss": 1.2960052490234375, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 4.850778687239608, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.0379464253783226, "logits/rejected": 0.13338619470596313, "logps/chosen": -1.260594129562378, "logps/rejected": -1.420748233795166, "loss": 1.0678, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.260594129562378, "rewards/margins": 0.16015416383743286, "rewards/rejected": -1.420748233795166, "sft_loss": 1.2914608716964722, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 4.985466695799678, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.09366512298583984, "logits/rejected": 0.05123235657811165, "logps/chosen": -1.307841181755066, "logps/rejected": -1.4251517057418823, "loss": 1.1218, "rewards/accuracies": 0.53125, "rewards/chosen": -1.307841181755066, "rewards/margins": 0.11731058359146118, "rewards/rejected": -1.4251517057418823, "sft_loss": 1.4095752239227295, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 11.716219695853741, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.10370634496212006, "logits/rejected": 0.1765981912612915, "logps/chosen": -1.2913461923599243, "logps/rejected": -1.4900809526443481, "loss": 1.0875, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2913461923599243, "rewards/margins": 0.19873474538326263, "rewards/rejected": -1.4900809526443481, "sft_loss": 1.3755576610565186, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 5.036271191010346, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.06814324855804443, "logits/rejected": 0.15410225093364716, "logps/chosen": -1.259528398513794, "logps/rejected": -1.3755052089691162, "loss": 1.0642, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.259528398513794, "rewards/margins": 0.1159767359495163, "rewards/rejected": -1.3755052089691162, "sft_loss": 1.227862000465393, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 5.019426570113224, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.10004373639822006, "logits/rejected": 0.16594137251377106, "logps/chosen": -1.270310640335083, "logps/rejected": -1.3529855012893677, "loss": 1.0724, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.270310640335083, "rewards/margins": 0.08267480880022049, "rewards/rejected": -1.3529855012893677, "sft_loss": 1.2502983808517456, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 5.393200355549483, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.009550745598971844, "logits/rejected": 0.08285556733608246, "logps/chosen": -1.3991400003433228, "logps/rejected": -1.479514479637146, "loss": 1.1429, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3991400003433228, "rewards/margins": 0.08037451654672623, "rewards/rejected": -1.479514479637146, "sft_loss": 1.4356224536895752, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 5.7140958550538, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.08716743439435959, "logits/rejected": 0.11977878957986832, "logps/chosen": -1.311036229133606, "logps/rejected": -1.4368469715118408, "loss": 1.0752, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.311036229133606, "rewards/margins": 0.12581077218055725, "rewards/rejected": -1.4368469715118408, "sft_loss": 1.3234432935714722, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 6.076491006812562, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.0319821834564209, "logits/rejected": 0.10077917575836182, "logps/chosen": -1.3090689182281494, "logps/rejected": -1.4595156908035278, "loss": 1.06, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3090689182281494, "rewards/margins": 0.150446817278862, "rewards/rejected": -1.4595156908035278, "sft_loss": 1.2954964637756348, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 8.604477479782899, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.07530542463064194, "logits/rejected": 0.05456411838531494, "logps/chosen": -1.385756492614746, "logps/rejected": -1.464805245399475, "loss": 1.1424, "rewards/accuracies": 0.5625, "rewards/chosen": -1.385756492614746, "rewards/margins": 0.07904873043298721, "rewards/rejected": -1.464805245399475, "sft_loss": 1.3918272256851196, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 6.859470538857442, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.042590927332639694, "logits/rejected": 0.05760595202445984, "logps/chosen": -1.2400459051132202, "logps/rejected": -1.393143653869629, "loss": 1.0823, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2400459051132202, "rewards/margins": 0.15309767425060272, "rewards/rejected": -1.393143653869629, "sft_loss": 1.3680957555770874, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 5.010480197078524, "learning_rate": 9.999984476788462e-07, "logits/chosen": -0.0016426980728283525, "logits/rejected": 0.0538158118724823, "logps/chosen": -1.3501454591751099, "logps/rejected": -1.484586238861084, "loss": 1.1061, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3501454591751099, "rewards/margins": 0.13444092869758606, "rewards/rejected": -1.484586238861084, "sft_loss": 1.3812997341156006, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 7.232570424800202, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.06226827949285507, "logits/rejected": 0.1579519659280777, "logps/chosen": -1.319981336593628, "logps/rejected": -1.4229294061660767, "loss": 1.1295, "rewards/accuracies": 0.5, "rewards/chosen": -1.319981336593628, "rewards/margins": 0.10294802486896515, "rewards/rejected": -1.4229294061660767, "sft_loss": 1.4071719646453857, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 5.655540568810285, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.046928636729717255, "logits/rejected": 0.013672498986124992, "logps/chosen": -1.2534117698669434, "logps/rejected": -1.3882801532745361, "loss": 1.063, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2534117698669434, "rewards/margins": 0.13486838340759277, "rewards/rejected": -1.3882801532745361, "sft_loss": 1.2757145166397095, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 5.707833022797435, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.049519095569849014, "logits/rejected": 0.10778944194316864, "logps/chosen": -1.2623019218444824, "logps/rejected": -1.459825038909912, "loss": 1.0382, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2623019218444824, "rewards/margins": 0.19752296805381775, "rewards/rejected": -1.459825038909912, "sft_loss": 1.285144567489624, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 6.503149678617331, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.10968382656574249, "logits/rejected": -0.006436157040297985, "logps/chosen": -1.4161120653152466, "logps/rejected": -1.4973254203796387, "loss": 1.1564, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4161120653152466, "rewards/margins": 0.08121319115161896, "rewards/rejected": -1.4973254203796387, "sft_loss": 1.4389655590057373, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 8.15081653817286, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.049354761838912964, "logits/rejected": 0.0784081220626831, "logps/chosen": -1.3722199201583862, "logps/rejected": -1.419933557510376, "loss": 1.1471, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3722199201583862, "rewards/margins": 0.04771358519792557, "rewards/rejected": -1.419933557510376, "sft_loss": 1.4077204465866089, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 6.3894639571813, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.009565544314682484, "logits/rejected": 0.14058226346969604, "logps/chosen": -1.2977478504180908, "logps/rejected": -1.4063743352890015, "loss": 1.0868, "rewards/accuracies": 0.5, "rewards/chosen": -1.2977478504180908, "rewards/margins": 0.108626589179039, "rewards/rejected": -1.4063743352890015, "sft_loss": 1.3116247653961182, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 5.1902933563226625, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.07538153231143951, "logits/rejected": 0.0692978948354721, "logps/chosen": -1.2987381219863892, "logps/rejected": -1.34641432762146, "loss": 1.1182, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2987381219863892, "rewards/margins": 0.0476762130856514, "rewards/rejected": -1.34641432762146, "sft_loss": 1.3346712589263916, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 6.027709998924295, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.08905048668384552, "logits/rejected": 0.002031295094639063, "logps/chosen": -1.366342306137085, "logps/rejected": -1.5508638620376587, "loss": 1.101, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.366342306137085, "rewards/margins": 0.1845216453075409, "rewards/rejected": -1.5508638620376587, "sft_loss": 1.396206021308899, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 15.537170016358042, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.04228251427412033, "logits/rejected": 0.19599834084510803, "logps/chosen": -1.3465120792388916, "logps/rejected": -1.4623966217041016, "loss": 1.1151, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3465120792388916, "rewards/margins": 0.1158844456076622, "rewards/rejected": -1.4623966217041016, "sft_loss": 1.3602538108825684, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 5.428017296314078, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.07138704508543015, "logits/rejected": 0.03556728735566139, "logps/chosen": -1.3303929567337036, "logps/rejected": -1.5150206089019775, "loss": 1.0923, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3303929567337036, "rewards/margins": 0.1846274882555008, "rewards/rejected": -1.5150206089019775, "sft_loss": 1.3598021268844604, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 5.632474227462753, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.028153136372566223, "logits/rejected": 0.09476649016141891, "logps/chosen": -1.401899814605713, "logps/rejected": -1.5244743824005127, "loss": 1.1213, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.401899814605713, "rewards/margins": 0.12257473170757294, "rewards/rejected": -1.5244743824005127, "sft_loss": 1.391305685043335, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 7.736028855117305, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.03588408976793289, "logits/rejected": 0.15140631794929504, "logps/chosen": -1.3081789016723633, "logps/rejected": -1.4813811779022217, "loss": 1.0652, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3081789016723633, "rewards/margins": 0.17320233583450317, "rewards/rejected": -1.4813811779022217, "sft_loss": 1.338536024093628, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 6.92638354225366, "learning_rate": 9.995381583144996e-07, "logits/chosen": -0.039926640689373016, "logits/rejected": 0.0706775039434433, "logps/chosen": -1.3458665609359741, "logps/rejected": -1.5222132205963135, "loss": 1.0572, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3458665609359741, "rewards/margins": 0.17634673416614532, "rewards/rejected": -1.5222132205963135, "sft_loss": 1.3203465938568115, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 4.937948526823435, "learning_rate": 9.994688118905471e-07, "logits/chosen": -0.02016545459628105, "logits/rejected": 0.22592997550964355, "logps/chosen": -1.4141805171966553, "logps/rejected": -1.503955602645874, "loss": 1.1504, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.4141805171966553, "rewards/margins": 0.08977502584457397, "rewards/rejected": -1.503955602645874, "sft_loss": 1.4209810495376587, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 21.163919428320348, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.11903375387191772, "logits/rejected": 0.08460259437561035, "logps/chosen": -1.3530133962631226, "logps/rejected": -1.4804702997207642, "loss": 1.1239, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3530133962631226, "rewards/margins": 0.12745679914951324, "rewards/rejected": -1.4804702997207642, "sft_loss": 1.4217400550842285, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 6.355009326849635, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.09540441632270813, "logits/rejected": -0.0039695026353001595, "logps/chosen": -1.2626725435256958, "logps/rejected": -1.4676183462142944, "loss": 1.0236, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2626725435256958, "rewards/margins": 0.20494571328163147, "rewards/rejected": -1.4676183462142944, "sft_loss": 1.256805658340454, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 8.182729549160133, "learning_rate": 9.992317004533313e-07, "logits/chosen": -0.03559732064604759, "logits/rejected": 0.10734357684850693, "logps/chosen": -1.4046621322631836, "logps/rejected": -1.5828628540039062, "loss": 1.0986, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4046621322631836, "rewards/margins": 0.17820079624652863, "rewards/rejected": -1.5828628540039062, "sft_loss": 1.4164342880249023, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 7.017317393413723, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.0322679728269577, "logits/rejected": 0.0426609069108963, "logps/chosen": -1.3416144847869873, "logps/rejected": -1.5490596294403076, "loss": 1.0927, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3416144847869873, "rewards/margins": 0.2074451446533203, "rewards/rejected": -1.5490596294403076, "sft_loss": 1.353048324584961, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 5.997866167672772, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.010479466058313847, "logits/rejected": 0.14654429256916046, "logps/chosen": -1.3053934574127197, "logps/rejected": -1.3830502033233643, "loss": 1.1136, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3053934574127197, "rewards/margins": 0.0776568129658699, "rewards/rejected": -1.3830502033233643, "sft_loss": 1.3474743366241455, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 7.708771512424861, "learning_rate": 9.989509973647416e-07, "logits/chosen": -0.005743196699768305, "logits/rejected": 0.13655301928520203, "logps/chosen": -1.2623566389083862, "logps/rejected": -1.422377347946167, "loss": 1.0684, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2623566389083862, "rewards/margins": 0.1600208282470703, "rewards/rejected": -1.422377347946167, "sft_loss": 1.3066545724868774, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 5.59439241677133, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.028394797816872597, "logits/rejected": 0.18746501207351685, "logps/chosen": -1.2996008396148682, "logps/rejected": -1.3868398666381836, "loss": 1.1132, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2996008396148682, "rewards/margins": 0.08723914623260498, "rewards/rejected": -1.3868398666381836, "sft_loss": 1.3947397470474243, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 6.069672629787288, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.04292871430516243, "logits/rejected": 0.040585361421108246, "logps/chosen": -1.2948095798492432, "logps/rejected": -1.5414516925811768, "loss": 1.0479, "rewards/accuracies": 0.625, "rewards/chosen": -1.2948095798492432, "rewards/margins": 0.2466420829296112, "rewards/rejected": -1.5414516925811768, "sft_loss": 1.345273733139038, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 5.583661151396963, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.08234535157680511, "logits/rejected": 0.25308576226234436, "logps/chosen": -1.3450154066085815, "logps/rejected": -1.4483263492584229, "loss": 1.1309, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3450154066085815, "rewards/margins": 0.10331089794635773, "rewards/rejected": -1.4483263492584229, "sft_loss": 1.327911138534546, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 9.253153511063843, "learning_rate": 9.985089602559123e-07, "logits/chosen": 0.03376932814717293, "logits/rejected": 0.2015124261379242, "logps/chosen": -1.3188982009887695, "logps/rejected": -1.4139798879623413, "loss": 1.1083, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3188982009887695, "rewards/margins": 0.0950816422700882, "rewards/rejected": -1.4139798879623413, "sft_loss": 1.329420804977417, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 7.192508641833526, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.05541741102933884, "logits/rejected": 0.09754420816898346, "logps/chosen": -1.3204059600830078, "logps/rejected": -1.4669349193572998, "loss": 1.1117, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3204059600830078, "rewards/margins": 0.14652886986732483, "rewards/rejected": -1.4669349193572998, "sft_loss": 1.3720275163650513, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 5.811098505616518, "learning_rate": 9.982589180787532e-07, "logits/chosen": 0.01835859753191471, "logits/rejected": 0.11538205295801163, "logps/chosen": -1.2177239656448364, "logps/rejected": -1.4119118452072144, "loss": 1.037, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2177239656448364, "rewards/margins": 0.19418802857398987, "rewards/rejected": -1.4119118452072144, "sft_loss": 1.260658860206604, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 7.543525850507948, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.09179548919200897, "logits/rejected": 0.05748724937438965, "logps/chosen": -1.38742995262146, "logps/rejected": -1.4838206768035889, "loss": 1.1139, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.38742995262146, "rewards/margins": 0.0963907390832901, "rewards/rejected": -1.4838206768035889, "sft_loss": 1.3663232326507568, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 5.602787814154604, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.08347173035144806, "logits/rejected": 0.10157792270183563, "logps/chosen": -1.3246452808380127, "logps/rejected": -1.5341843366622925, "loss": 1.0643, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3246452808380127, "rewards/margins": 0.2095392644405365, "rewards/rejected": -1.5341843366622925, "sft_loss": 1.3606746196746826, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 5.9317855501058885, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.04531757906079292, "logits/rejected": 0.1470317542552948, "logps/chosen": -1.28559148311615, "logps/rejected": -1.4513322114944458, "loss": 1.0489, "rewards/accuracies": 0.59375, "rewards/chosen": -1.28559148311615, "rewards/margins": 0.16574081778526306, "rewards/rejected": -1.4513322114944458, "sft_loss": 1.2866220474243164, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 7.107217318726965, "learning_rate": 9.97700834996658e-07, "logits/chosen": -0.02601141855120659, "logits/rejected": 0.15081417560577393, "logps/chosen": -1.3638850450515747, "logps/rejected": -1.5114787817001343, "loss": 1.0756, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3638850450515747, "rewards/margins": 0.1475939303636551, "rewards/rejected": -1.5114787817001343, "sft_loss": 1.3407633304595947, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 6.8293223560449015, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.04921600967645645, "logits/rejected": 0.2499714344739914, "logps/chosen": -1.4132165908813477, "logps/rejected": -1.546648621559143, "loss": 1.1354, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4132165908813477, "rewards/margins": 0.13343189656734467, "rewards/rejected": -1.546648621559143, "sft_loss": 1.4309947490692139, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 6.7983001447550375, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.11556123197078705, "logits/rejected": 0.027849048376083374, "logps/chosen": -1.2401540279388428, "logps/rejected": -1.535406470298767, "loss": 1.0106, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2401540279388428, "rewards/margins": 0.29525232315063477, "rewards/rejected": -1.535406470298767, "sft_loss": 1.3017752170562744, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 6.005120314203222, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.13792164623737335, "logits/rejected": 0.031013095751404762, "logps/chosen": -1.3492605686187744, "logps/rejected": -1.5073153972625732, "loss": 1.0958, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3492605686187744, "rewards/margins": 0.1580551117658615, "rewards/rejected": -1.5073153972625732, "sft_loss": 1.3596335649490356, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 5.887621096403035, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.11696688830852509, "logits/rejected": -0.009747383184731007, "logps/chosen": -1.3687269687652588, "logps/rejected": -1.4918612241744995, "loss": 1.1015, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3687269687652588, "rewards/margins": 0.12313439697027206, "rewards/rejected": -1.4918612241744995, "sft_loss": 1.4068576097488403, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 10.029945413063814, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.0755590945482254, "logits/rejected": 0.10666684806346893, "logps/chosen": -1.448311686515808, "logps/rejected": -1.524222493171692, "loss": 1.1654, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.448311686515808, "rewards/margins": 0.07591084390878677, "rewards/rejected": -1.524222493171692, "sft_loss": 1.454717993736267, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 5.7304804725359375, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.06337063759565353, "logits/rejected": 0.1517437994480133, "logps/chosen": -1.4033968448638916, "logps/rejected": -1.6506102085113525, "loss": 1.1044, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4033968448638916, "rewards/margins": 0.24721336364746094, "rewards/rejected": -1.6506102085113525, "sft_loss": 1.3806891441345215, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 6.054151935024956, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.052485160529613495, "logits/rejected": 0.11813749372959137, "logps/chosen": -1.3426921367645264, "logps/rejected": -1.5462762117385864, "loss": 1.063, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3426921367645264, "rewards/margins": 0.20358403027057648, "rewards/rejected": -1.5462762117385864, "sft_loss": 1.3526374101638794, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 5.912686110630131, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.0018018543487414718, "logits/rejected": 0.14155825972557068, "logps/chosen": -1.3522013425827026, "logps/rejected": -1.491640329360962, "loss": 1.114, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3522013425827026, "rewards/margins": 0.1394389122724533, "rewards/rejected": -1.491640329360962, "sft_loss": 1.3876965045928955, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 4.741798302717173, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.0866033211350441, "logits/rejected": -0.01305533666163683, "logps/chosen": -1.3660171031951904, "logps/rejected": -1.5640350580215454, "loss": 1.0843, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3660171031951904, "rewards/margins": 0.1980178952217102, "rewards/rejected": -1.5640350580215454, "sft_loss": 1.3981744050979614, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 6.469291565478886, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.03893253952264786, "logits/rejected": 0.05785505101084709, "logps/chosen": -1.3518002033233643, "logps/rejected": -1.482414960861206, "loss": 1.0818, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3518002033233643, "rewards/margins": 0.13061493635177612, "rewards/rejected": -1.482414960861206, "sft_loss": 1.3298957347869873, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 6.762984889125461, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.033855509012937546, "logits/rejected": 0.16177025437355042, "logps/chosen": -1.291409969329834, "logps/rejected": -1.590272068977356, "loss": 1.0386, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.291409969329834, "rewards/margins": 0.2988620400428772, "rewards/rejected": -1.590272068977356, "sft_loss": 1.3071105480194092, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 6.759201707442236, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.12731292843818665, "logits/rejected": 0.047011490911245346, "logps/chosen": -1.3670367002487183, "logps/rejected": -1.5798918008804321, "loss": 1.077, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3670367002487183, "rewards/margins": 0.21285513043403625, "rewards/rejected": -1.5798918008804321, "sft_loss": 1.3249647617340088, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 4.474840266700944, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.17214761674404144, "logits/rejected": -0.01971476711332798, "logps/chosen": -1.2922381162643433, "logps/rejected": -1.5108747482299805, "loss": 1.0514, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2922381162643433, "rewards/margins": 0.21863672137260437, "rewards/rejected": -1.5108747482299805, "sft_loss": 1.3367199897766113, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 5.417824675948416, "learning_rate": 9.951398126243133e-07, "logits/chosen": -0.0019948245026171207, "logits/rejected": 0.12875740230083466, "logps/chosen": -1.2788978815078735, "logps/rejected": -1.5393446683883667, "loss": 1.0361, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2788978815078735, "rewards/margins": 0.2604469656944275, "rewards/rejected": -1.5393446683883667, "sft_loss": 1.2987463474273682, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 6.207791305444593, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.09008261561393738, "logits/rejected": 0.040010981261730194, "logps/chosen": -1.337257981300354, "logps/rejected": -1.4963468313217163, "loss": 1.0769, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.337257981300354, "rewards/margins": 0.15908858180046082, "rewards/rejected": -1.4963468313217163, "sft_loss": 1.3328725099563599, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 7.907547690864367, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.08714650571346283, "logits/rejected": 0.11193932592868805, "logps/chosen": -1.3179038763046265, "logps/rejected": -1.5680185556411743, "loss": 1.0667, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3179038763046265, "rewards/margins": 0.25011464953422546, "rewards/rejected": -1.5680185556411743, "sft_loss": 1.342376470565796, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 5.856203248101176, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.06098737567663193, "logits/rejected": 0.02430593967437744, "logps/chosen": -1.322656273841858, "logps/rejected": -1.4689748287200928, "loss": 1.0764, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.322656273841858, "rewards/margins": 0.14631858468055725, "rewards/rejected": -1.4689748287200928, "sft_loss": 1.310194730758667, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.28842443227767944, "eval_logits/rejected": 0.38061627745628357, "eval_logps/chosen": -1.3602670431137085, "eval_logps/rejected": -1.5872876644134521, "eval_loss": 1.0738680362701416, "eval_rewards/accuracies": 0.5823442339897156, "eval_rewards/chosen": -1.3602670431137085, "eval_rewards/margins": 0.22702065110206604, "eval_rewards/rejected": -1.5872876644134521, "eval_runtime": 43.4088, "eval_samples_per_second": 30.985, "eval_sft_loss": 1.3759210109710693, "eval_steps_per_second": 7.763, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 7.53475500870321, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.11175141483545303, "logits/rejected": 0.053176987916231155, "logps/chosen": -1.361185073852539, "logps/rejected": -1.6331669092178345, "loss": 1.0675, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.361185073852539, "rewards/margins": 0.2719815969467163, "rewards/rejected": -1.6331669092178345, "sft_loss": 1.3797080516815186, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 6.7075427581976514, "learning_rate": 9.939967071845424e-07, "logits/chosen": 0.023669257760047913, "logits/rejected": 0.10120322555303574, "logps/chosen": -1.2819058895111084, "logps/rejected": -1.5174692869186401, "loss": 1.0449, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2819058895111084, "rewards/margins": 0.23556344211101532, "rewards/rejected": -1.5174692869186401, "sft_loss": 1.2974836826324463, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 10.800344000340381, "learning_rate": 9.937536987168413e-07, "logits/chosen": 0.029402485117316246, "logits/rejected": 0.17515210807323456, "logps/chosen": -1.2987323999404907, "logps/rejected": -1.6028416156768799, "loss": 1.0363, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2987323999404907, "rewards/margins": 0.30410921573638916, "rewards/rejected": -1.6028416156768799, "sft_loss": 1.3477541208267212, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 6.970876705465844, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.017118550837039948, "logits/rejected": 0.06861446797847748, "logps/chosen": -1.3582419157028198, "logps/rejected": -1.6129653453826904, "loss": 1.0714, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3582419157028198, "rewards/margins": 0.25472337007522583, "rewards/rejected": -1.6129653453826904, "sft_loss": 1.3721500635147095, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 12.073160531115478, "learning_rate": 9.932533129839333e-07, "logits/chosen": -0.054477252066135406, "logits/rejected": 0.07593884319067001, "logps/chosen": -1.2791856527328491, "logps/rejected": -1.4791104793548584, "loss": 1.093, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2791856527328491, "rewards/margins": 0.19992482662200928, "rewards/rejected": -1.4791104793548584, "sft_loss": 1.3659038543701172, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 6.829016103118746, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.017300017178058624, "logits/rejected": 0.18187452852725983, "logps/chosen": -1.365114450454712, "logps/rejected": -1.502744197845459, "loss": 1.0911, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.365114450454712, "rewards/margins": 0.13762979209423065, "rewards/rejected": -1.502744197845459, "sft_loss": 1.3418179750442505, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 6.897677799362671, "learning_rate": 9.927337851142314e-07, "logits/chosen": -0.013731849379837513, "logits/rejected": 0.12393651157617569, "logps/chosen": -1.2885291576385498, "logps/rejected": -1.485176682472229, "loss": 1.0742, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2885291576385498, "rewards/margins": 0.19664745032787323, "rewards/rejected": -1.485176682472229, "sft_loss": 1.3521944284439087, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 5.820938820543634, "learning_rate": 9.924668491496474e-07, "logits/chosen": -0.013512268662452698, "logits/rejected": 0.14876945316791534, "logps/chosen": -1.3282172679901123, "logps/rejected": -1.5874078273773193, "loss": 1.0753, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3282172679901123, "rewards/margins": 0.25919073820114136, "rewards/rejected": -1.5874078273773193, "sft_loss": 1.3661084175109863, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 3.816211993237183, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.07689845561981201, "logits/rejected": 0.15110646188259125, "logps/chosen": -1.3248929977416992, "logps/rejected": -1.463343620300293, "loss": 1.1157, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3248929977416992, "rewards/margins": 0.13845068216323853, "rewards/rejected": -1.463343620300293, "sft_loss": 1.388550043106079, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 6.669844224252639, "learning_rate": 9.919186461100574e-07, "logits/chosen": 0.0023181817959994078, "logits/rejected": 0.07454365491867065, "logps/chosen": -1.2944786548614502, "logps/rejected": -1.496414303779602, "loss": 1.0347, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2944786548614502, "rewards/margins": 0.20193564891815186, "rewards/rejected": -1.496414303779602, "sft_loss": 1.311168909072876, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 8.563766361912304, "learning_rate": 9.9163738435372e-07, "logits/chosen": -0.014415117911994457, "logits/rejected": 0.1365610510110855, "logps/chosen": -1.3581111431121826, "logps/rejected": -1.6337801218032837, "loss": 1.0798, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3581111431121826, "rewards/margins": 0.2756689190864563, "rewards/rejected": -1.6337801218032837, "sft_loss": 1.3649697303771973, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 5.764935710539851, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.06975705921649933, "logits/rejected": 0.10214301198720932, "logps/chosen": -1.3912808895111084, "logps/rejected": -1.6792808771133423, "loss": 1.0662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3912808895111084, "rewards/margins": 0.28800004720687866, "rewards/rejected": -1.6792808771133423, "sft_loss": 1.4005149602890015, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 9.952313300460148, "learning_rate": 9.910605540119474e-07, "logits/chosen": -0.00964144803583622, "logits/rejected": 0.09255403280258179, "logps/chosen": -1.3124510049819946, "logps/rejected": -1.6167314052581787, "loss": 1.0525, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3124510049819946, "rewards/margins": 0.30428043007850647, "rewards/rejected": -1.6167314052581787, "sft_loss": 1.3130879402160645, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 9.218895866731891, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.10176321119070053, "logits/rejected": 0.16877254843711853, "logps/chosen": -1.362540602684021, "logps/rejected": -1.5649629831314087, "loss": 1.1078, "rewards/accuracies": 0.53125, "rewards/chosen": -1.362540602684021, "rewards/margins": 0.2024223804473877, "rewards/rejected": -1.5649629831314087, "sft_loss": 1.4236619472503662, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 6.9159090268667995, "learning_rate": 9.90464666629803e-07, "logits/chosen": 0.005976744927465916, "logits/rejected": 0.08559007197618484, "logps/chosen": -1.3926869630813599, "logps/rejected": -1.5787603855133057, "loss": 1.1216, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3926869630813599, "rewards/margins": 0.1860734224319458, "rewards/rejected": -1.5787603855133057, "sft_loss": 1.3701099157333374, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 5.733344853923304, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.00022365078621078283, "logits/rejected": 0.16461063921451569, "logps/chosen": -1.4307525157928467, "logps/rejected": -1.6447960138320923, "loss": 1.088, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4307525157928467, "rewards/margins": 0.21404337882995605, "rewards/rejected": -1.6447960138320923, "sft_loss": 1.3551019430160522, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 7.0965498514729575, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.0713481456041336, "logits/rejected": 0.010779242031276226, "logps/chosen": -1.3277537822723389, "logps/rejected": -1.5847175121307373, "loss": 1.0414, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3277537822723389, "rewards/margins": 0.2569636106491089, "rewards/rejected": -1.5847175121307373, "sft_loss": 1.3591192960739136, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 6.339401667531269, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.17879191040992737, "logits/rejected": -0.05101003497838974, "logps/chosen": -1.3813947439193726, "logps/rejected": -1.5883454084396362, "loss": 1.095, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3813947439193726, "rewards/margins": 0.20695054531097412, "rewards/rejected": -1.5883454084396362, "sft_loss": 1.425832748413086, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 5.747980078764873, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.030855897814035416, "logits/rejected": 0.14914752542972565, "logps/chosen": -1.258709192276001, "logps/rejected": -1.4399124383926392, "loss": 1.0615, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.258709192276001, "rewards/margins": 0.1812031865119934, "rewards/rejected": -1.4399124383926392, "sft_loss": 1.3080114126205444, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 5.999983841636828, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.09650512784719467, "logits/rejected": -0.025805041193962097, "logps/chosen": -1.2652348279953003, "logps/rejected": -1.5569730997085571, "loss": 1.0381, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2652348279953003, "rewards/margins": 0.2917383015155792, "rewards/rejected": -1.5569730997085571, "sft_loss": 1.3040887117385864, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 5.729597922053128, "learning_rate": 9.885628971850641e-07, "logits/chosen": -0.006761780474334955, "logits/rejected": 0.1825835257768631, "logps/chosen": -1.3510396480560303, "logps/rejected": -1.6165297031402588, "loss": 1.0879, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3510396480560303, "rewards/margins": 0.2654899060726166, "rewards/rejected": -1.6165297031402588, "sft_loss": 1.396750569343567, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 3.9699737151417662, "learning_rate": 9.882293271315481e-07, "logits/chosen": -0.04263655096292496, "logits/rejected": 0.07031328976154327, "logps/chosen": -1.3665261268615723, "logps/rejected": -1.5466525554656982, "loss": 1.1073, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3665261268615723, "rewards/margins": 0.18012654781341553, "rewards/rejected": -1.5466525554656982, "sft_loss": 1.358182668685913, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 5.605907446139715, "learning_rate": 9.878910202749589e-07, "logits/chosen": 0.010189378634095192, "logits/rejected": 0.20491977035999298, "logps/chosen": -1.3190621137619019, "logps/rejected": -1.5155293941497803, "loss": 1.0695, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3190621137619019, "rewards/margins": 0.196467325091362, "rewards/rejected": -1.5155293941497803, "sft_loss": 1.3298662900924683, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 6.713530471211944, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.11538930237293243, "logits/rejected": 0.26346588134765625, "logps/chosen": -1.2698981761932373, "logps/rejected": -1.5307300090789795, "loss": 1.0583, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2698981761932373, "rewards/margins": 0.26083195209503174, "rewards/rejected": -1.5307300090789795, "sft_loss": 1.3178085088729858, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 5.811831680741933, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.04077720642089844, "logits/rejected": 0.13811177015304565, "logps/chosen": -1.375215768814087, "logps/rejected": -1.5146197080612183, "loss": 1.1019, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.375215768814087, "rewards/margins": 0.1394037902355194, "rewards/rejected": -1.5146197080612183, "sft_loss": 1.364015817642212, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 8.295351988413582, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.054763637483119965, "logits/rejected": 0.06431882083415985, "logps/chosen": -1.3140287399291992, "logps/rejected": -1.6464738845825195, "loss": 1.0531, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3140287399291992, "rewards/margins": 0.33244508504867554, "rewards/rejected": -1.6464738845825195, "sft_loss": 1.3359577655792236, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 5.337603380866405, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.0186604093760252, "logits/rejected": 0.06460899114608765, "logps/chosen": -1.2734229564666748, "logps/rejected": -1.526149868965149, "loss": 1.0612, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2734229564666748, "rewards/margins": 0.2527269423007965, "rewards/rejected": -1.526149868965149, "sft_loss": 1.3169612884521484, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 7.876389679343498, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.033787619322538376, "logits/rejected": 0.08481265604496002, "logps/chosen": -1.3385286331176758, "logps/rejected": -1.5050402879714966, "loss": 1.089, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3385286331176758, "rewards/margins": 0.1665116548538208, "rewards/rejected": -1.5050402879714966, "sft_loss": 1.3689563274383545, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 9.469117695278278, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.06023671478033066, "logits/rejected": 0.07909716665744781, "logps/chosen": -1.3198426961898804, "logps/rejected": -1.6279243230819702, "loss": 1.0234, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3198426961898804, "rewards/margins": 0.30808156728744507, "rewards/rejected": -1.6279243230819702, "sft_loss": 1.31898832321167, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 6.068755565762643, "learning_rate": 9.853905232845727e-07, "logits/chosen": -0.02013743296265602, "logits/rejected": 0.16080233454704285, "logps/chosen": -1.425286054611206, "logps/rejected": -1.582793951034546, "loss": 1.1457, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.425286054611206, "rewards/margins": 0.157507985830307, "rewards/rejected": -1.582793951034546, "sft_loss": 1.4227519035339355, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 9.116465562050807, "learning_rate": 9.850144440181095e-07, "logits/chosen": 0.022849196568131447, "logits/rejected": 0.258659303188324, "logps/chosen": -1.4186164140701294, "logps/rejected": -1.5973553657531738, "loss": 1.1386, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4186164140701294, "rewards/margins": 0.17873908579349518, "rewards/rejected": -1.5973553657531738, "sft_loss": 1.4624502658843994, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 6.918046448941572, "learning_rate": 9.846336591393832e-07, "logits/chosen": -0.0351165309548378, "logits/rejected": 0.11834853887557983, "logps/chosen": -1.3530397415161133, "logps/rejected": -1.4933912754058838, "loss": 1.1083, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3530397415161133, "rewards/margins": 0.14035165309906006, "rewards/rejected": -1.4933912754058838, "sft_loss": 1.3699219226837158, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 6.1912600134012505, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.05374523252248764, "logits/rejected": 0.05349243804812431, "logps/chosen": -1.3894104957580566, "logps/rejected": -1.6738580465316772, "loss": 1.0849, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3894104957580566, "rewards/margins": 0.28444749116897583, "rewards/rejected": -1.6738580465316772, "sft_loss": 1.4124128818511963, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 5.065747116617297, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.06455997377634048, "logits/rejected": 0.07006511837244034, "logps/chosen": -1.2654345035552979, "logps/rejected": -1.4739532470703125, "loss": 1.0503, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2654345035552979, "rewards/margins": 0.20851869881153107, "rewards/rejected": -1.4739532470703125, "sft_loss": 1.2775202989578247, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 5.327061763368941, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.10575266182422638, "logits/rejected": 0.08543635904788971, "logps/chosen": -1.3447576761245728, "logps/rejected": -1.5180432796478271, "loss": 1.0823, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3447576761245728, "rewards/margins": 0.17328575253486633, "rewards/rejected": -1.5180432796478271, "sft_loss": 1.3948814868927002, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 8.020182464396328, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.08101774752140045, "logits/rejected": 0.1236882358789444, "logps/chosen": -1.3992125988006592, "logps/rejected": -1.600408911705017, "loss": 1.1059, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3992125988006592, "rewards/margins": 0.20119652152061462, "rewards/rejected": -1.600408911705017, "sft_loss": 1.4059171676635742, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 6.744511276591418, "learning_rate": 9.826592814608517e-07, "logits/chosen": 0.012089039199054241, "logits/rejected": 0.20911423861980438, "logps/chosen": -1.3623487949371338, "logps/rejected": -1.5705373287200928, "loss": 1.1032, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3623487949371338, "rewards/margins": 0.20818853378295898, "rewards/rejected": -1.5705373287200928, "sft_loss": 1.4302294254302979, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 5.064980584747938, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.06235666945576668, "logits/rejected": 0.09905209392309189, "logps/chosen": -1.208686351776123, "logps/rejected": -1.4888397455215454, "loss": 1.0117, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.208686351776123, "rewards/margins": 0.28015339374542236, "rewards/rejected": -1.4888397455215454, "sft_loss": 1.2937543392181396, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 6.429721404080052, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.08417926728725433, "logits/rejected": 0.16567587852478027, "logps/chosen": -1.326179027557373, "logps/rejected": -1.454859972000122, "loss": 1.1338, "rewards/accuracies": 0.5, "rewards/chosen": -1.326179027557373, "rewards/margins": 0.1286809742450714, "rewards/rejected": -1.454859972000122, "sft_loss": 1.3937551975250244, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 5.9936758517936415, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.06984042376279831, "logits/rejected": 0.19560235738754272, "logps/chosen": -1.3520762920379639, "logps/rejected": -1.6111743450164795, "loss": 1.0875, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3520762920379639, "rewards/margins": 0.2590981125831604, "rewards/rejected": -1.6111743450164795, "sft_loss": 1.3810752630233765, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 9.248342880601742, "learning_rate": 9.809954672881237e-07, "logits/chosen": 0.041423261165618896, "logits/rejected": 0.2167244404554367, "logps/chosen": -1.368880271911621, "logps/rejected": -1.5877668857574463, "loss": 1.1082, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.368880271911621, "rewards/margins": 0.21888649463653564, "rewards/rejected": -1.5877668857574463, "sft_loss": 1.413527250289917, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 8.22317017700777, "learning_rate": 9.80567836992274e-07, "logits/chosen": 0.007480998523533344, "logits/rejected": 0.2067534625530243, "logps/chosen": -1.225550889968872, "logps/rejected": -1.5645802021026611, "loss": 1.0161, "rewards/accuracies": 0.5625, "rewards/chosen": -1.225550889968872, "rewards/margins": 0.33902937173843384, "rewards/rejected": -1.5645802021026611, "sft_loss": 1.2699626684188843, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 6.876544893331995, "learning_rate": 9.801355442251625e-07, "logits/chosen": -0.02531070075929165, "logits/rejected": 0.15708212554454803, "logps/chosen": -1.2901298999786377, "logps/rejected": -1.5163029432296753, "loss": 1.06, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2901298999786377, "rewards/margins": 0.22617287933826447, "rewards/rejected": -1.5163029432296753, "sft_loss": 1.3433908224105835, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 6.719145985760094, "learning_rate": 9.796985931808949e-07, "logits/chosen": 0.0003074899432249367, "logits/rejected": 0.15064993500709534, "logps/chosen": -1.3295269012451172, "logps/rejected": -1.5829570293426514, "loss": 1.0556, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3295269012451172, "rewards/margins": 0.2534298896789551, "rewards/rejected": -1.5829570293426514, "sft_loss": 1.3862606287002563, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 8.751983877055366, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.02134351059794426, "logits/rejected": 0.10547629743814468, "logps/chosen": -1.2631934881210327, "logps/rejected": -1.5912244319915771, "loss": 1.0187, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2631934881210327, "rewards/margins": 0.3280307948589325, "rewards/rejected": -1.5912244319915771, "sft_loss": 1.288779616355896, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 8.947801392685536, "learning_rate": 9.788107332632493e-07, "logits/chosen": 0.0026383884251117706, "logits/rejected": 0.0938570499420166, "logps/chosen": -1.3454915285110474, "logps/rejected": -1.5478971004486084, "loss": 1.0977, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3454915285110474, "rewards/margins": 0.2024056613445282, "rewards/rejected": -1.5478971004486084, "sft_loss": 1.3874528408050537, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 8.030719702681695, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.02961505576968193, "logits/rejected": 0.0943761020898819, "logps/chosen": -1.4095746278762817, "logps/rejected": -1.5417417287826538, "loss": 1.1212, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4095746278762817, "rewards/margins": 0.13216717541217804, "rewards/rejected": -1.5417417287826538, "sft_loss": 1.4097391366958618, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 13.073240298154332, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.013182336464524269, "logits/rejected": 0.17865802347660065, "logps/chosen": -1.330731987953186, "logps/rejected": -1.6062644720077515, "loss": 1.0829, "rewards/accuracies": 0.625, "rewards/chosen": -1.330731987953186, "rewards/margins": 0.2755325734615326, "rewards/rejected": -1.6062644720077515, "sft_loss": 1.3806637525558472, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 3.859480468158749, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.0784904956817627, "logits/rejected": 0.066562220454216, "logps/chosen": -1.3023416996002197, "logps/rejected": -1.6012264490127563, "loss": 1.0615, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3023416996002197, "rewards/margins": 0.2988850474357605, "rewards/rejected": -1.6012264490127563, "sft_loss": 1.3957256078720093, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 7.389526699160587, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.08934468030929565, "logits/rejected": 0.01706491783261299, "logps/chosen": -1.3709447383880615, "logps/rejected": -1.700160026550293, "loss": 1.0676, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3709447383880615, "rewards/margins": 0.3292153477668762, "rewards/rejected": -1.700160026550293, "sft_loss": 1.4127471446990967, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 8.982720974634528, "learning_rate": 9.765098658960035e-07, "logits/chosen": -0.013047914020717144, "logits/rejected": 0.0696062445640564, "logps/chosen": -1.3615453243255615, "logps/rejected": -1.6033151149749756, "loss": 1.0755, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3615453243255615, "rewards/margins": 0.24176998436450958, "rewards/rejected": -1.6033151149749756, "sft_loss": 1.4036492109298706, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 7.6494199427540135, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.025501590222120285, "logits/rejected": 0.20580200850963593, "logps/chosen": -1.4298983812332153, "logps/rejected": -1.6337827444076538, "loss": 1.1003, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4298983812332153, "rewards/margins": 0.20388436317443848, "rewards/rejected": -1.6337827444076538, "sft_loss": 1.4225795269012451, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 4.614253761473676, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.032683633267879486, "logits/rejected": 0.16721466183662415, "logps/chosen": -1.3229620456695557, "logps/rejected": -1.6731964349746704, "loss": 1.0445, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3229620456695557, "rewards/margins": 0.3502345085144043, "rewards/rejected": -1.6731964349746704, "sft_loss": 1.41048264503479, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 6.6237622453727605, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.1174614205956459, "logits/rejected": 0.1363617330789566, "logps/chosen": -1.3533555269241333, "logps/rejected": -1.6572144031524658, "loss": 1.0431, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3533555269241333, "rewards/margins": 0.3038588762283325, "rewards/rejected": -1.6572144031524658, "sft_loss": 1.357505202293396, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 5.882814130729912, "learning_rate": 9.74585930072237e-07, "logits/chosen": -0.021414924412965775, "logits/rejected": 0.11573544889688492, "logps/chosen": -1.308184266090393, "logps/rejected": -1.6624805927276611, "loss": 1.0415, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.308184266090393, "rewards/margins": 0.3542962670326233, "rewards/rejected": -1.6624805927276611, "sft_loss": 1.358493447303772, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 7.510823956083402, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.08719275146722794, "logits/rejected": 0.02298247441649437, "logps/chosen": -1.438455581665039, "logps/rejected": -1.5987141132354736, "loss": 1.1326, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.438455581665039, "rewards/margins": 0.1602584421634674, "rewards/rejected": -1.5987141132354736, "sft_loss": 1.4535404443740845, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 8.493464333567369, "learning_rate": 9.735963167736698e-07, "logits/chosen": -0.022415516898036003, "logits/rejected": 0.1457872837781906, "logps/chosen": -1.3727210760116577, "logps/rejected": -1.4455827474594116, "loss": 1.1389, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.3727210760116577, "rewards/margins": 0.0728617012500763, "rewards/rejected": -1.4455827474594116, "sft_loss": 1.382156491279602, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 7.6187882316184705, "learning_rate": 9.730946154626078e-07, "logits/chosen": -0.0488225594162941, "logits/rejected": 0.05780552700161934, "logps/chosen": -1.3416324853897095, "logps/rejected": -1.4273046255111694, "loss": 1.1184, "rewards/accuracies": 0.5, "rewards/chosen": -1.3416324853897095, "rewards/margins": 0.08567220717668533, "rewards/rejected": -1.4273046255111694, "sft_loss": 1.3312969207763672, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 6.128001263278665, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.1998719424009323, "logits/rejected": -0.07255266606807709, "logps/chosen": -1.2856649160385132, "logps/rejected": -1.519370675086975, "loss": 1.0475, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2856649160385132, "rewards/margins": 0.23370572924613953, "rewards/rejected": -1.519370675086975, "sft_loss": 1.3287267684936523, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 7.2254979243325606, "learning_rate": 9.720774478544218e-07, "logits/chosen": -0.06452328711748123, "logits/rejected": 0.03451596572995186, "logps/chosen": -1.197929859161377, "logps/rejected": -1.6241276264190674, "loss": 0.9772, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.197929859161377, "rewards/margins": 0.42619770765304565, "rewards/rejected": -1.6241276264190674, "sft_loss": 1.2388383150100708, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 7.583297166453061, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.09063072502613068, "logits/rejected": -0.011754634790122509, "logps/chosen": -1.3256213665008545, "logps/rejected": -1.5247026681900024, "loss": 1.0775, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3256213665008545, "rewards/margins": 0.19908128678798676, "rewards/rejected": -1.5247026681900024, "sft_loss": 1.3233109712600708, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 7.966476893906217, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.07386825978755951, "logits/rejected": 0.05149867385625839, "logps/chosen": -1.3055206537246704, "logps/rejected": -1.4467628002166748, "loss": 1.0725, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3055206537246704, "rewards/margins": 0.14124202728271484, "rewards/rejected": -1.4467628002166748, "sft_loss": 1.3087950944900513, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 11.249291657242251, "learning_rate": 9.705173583245643e-07, "logits/chosen": -0.02822117879986763, "logits/rejected": 0.08976884186267853, "logps/chosen": -1.2478373050689697, "logps/rejected": -1.5560951232910156, "loss": 0.9948, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2478373050689697, "rewards/margins": 0.3082577586174011, "rewards/rejected": -1.5560951232910156, "sft_loss": 1.2365633249282837, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 6.592433420949461, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.18813884258270264, "logits/rejected": -0.08145233243703842, "logps/chosen": -1.292750358581543, "logps/rejected": -1.5763019323349, "loss": 1.0412, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.292750358581543, "rewards/margins": 0.28355178236961365, "rewards/rejected": -1.5763019323349, "sft_loss": 1.325889229774475, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 9.103225273951479, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.1960795670747757, "logits/rejected": -0.01546870730817318, "logps/chosen": -1.3597862720489502, "logps/rejected": -1.7043052911758423, "loss": 1.0409, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3597862720489502, "rewards/margins": 0.34451884031295776, "rewards/rejected": -1.7043052911758423, "sft_loss": 1.3517060279846191, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 8.953833006284222, "learning_rate": 9.689161844071755e-07, "logits/chosen": -0.015761854127049446, "logits/rejected": 0.04325942322611809, "logps/chosen": -1.3681681156158447, "logps/rejected": -1.6375095844268799, "loss": 1.0446, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3681681156158447, "rewards/margins": 0.26934143900871277, "rewards/rejected": -1.6375095844268799, "sft_loss": 1.3063232898712158, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 7.912221491269787, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.09429174661636353, "logits/rejected": 0.06249885633587837, "logps/chosen": -1.4401443004608154, "logps/rejected": -1.7265408039093018, "loss": 1.0669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4401443004608154, "rewards/margins": 0.286396324634552, "rewards/rejected": -1.7265408039093018, "sft_loss": 1.3381919860839844, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 7.290932210175596, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.09101923555135727, "logits/rejected": -0.04793179780244827, "logps/chosen": -1.3850233554840088, "logps/rejected": -1.5009348392486572, "loss": 1.137, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3850233554840088, "rewards/margins": 0.115911565721035, "rewards/rejected": -1.5009348392486572, "sft_loss": 1.4382996559143066, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 5.994763978269897, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.21217043697834015, "logits/rejected": -0.09275046736001968, "logps/chosen": -1.3658145666122437, "logps/rejected": -1.6350940465927124, "loss": 1.0997, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3658145666122437, "rewards/margins": 0.2692795395851135, "rewards/rejected": -1.6350940465927124, "sft_loss": 1.430191993713379, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 6.2537328293323515, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.18638640642166138, "logits/rejected": -0.04663598909974098, "logps/chosen": -1.2614182233810425, "logps/rejected": -1.5357428789138794, "loss": 1.0133, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2614182233810425, "rewards/margins": 0.27432459592819214, "rewards/rejected": -1.5357428789138794, "sft_loss": 1.2610071897506714, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 6.153085584027617, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.0817769393324852, "logits/rejected": 0.008442547172307968, "logps/chosen": -1.3463274240493774, "logps/rejected": -1.5101841688156128, "loss": 1.0983, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3463274240493774, "rewards/margins": 0.16385677456855774, "rewards/rejected": -1.5101841688156128, "sft_loss": 1.383152723312378, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 14.169619640940896, "learning_rate": 9.655911462268327e-07, "logits/chosen": -0.02574927732348442, "logits/rejected": 0.06538109481334686, "logps/chosen": -1.30341637134552, "logps/rejected": -1.5070956945419312, "loss": 1.07, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.30341637134552, "rewards/margins": 0.20367932319641113, "rewards/rejected": -1.5070956945419312, "sft_loss": 1.3562206029891968, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 5.081540117717634, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.07648883759975433, "logits/rejected": 0.006164224352687597, "logps/chosen": -1.256722092628479, "logps/rejected": -1.4922142028808594, "loss": 1.0285, "rewards/accuracies": 0.59375, "rewards/chosen": -1.256722092628479, "rewards/margins": 0.23549184203147888, "rewards/rejected": -1.4922142028808594, "sft_loss": 1.2796533107757568, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 5.366403260527972, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.10264239460229874, "logits/rejected": 0.056674420833587646, "logps/chosen": -1.4898585081100464, "logps/rejected": -1.6733713150024414, "loss": 1.1404, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4898585081100464, "rewards/margins": 0.18351267278194427, "rewards/rejected": -1.6733713150024414, "sft_loss": 1.449183702468872, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 4.08768961851226, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.08438001573085785, "logits/rejected": 0.0963730663061142, "logps/chosen": -1.2894665002822876, "logps/rejected": -1.5089442729949951, "loss": 1.0759, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2894665002822876, "rewards/margins": 0.21947786211967468, "rewards/rejected": -1.5089442729949951, "sft_loss": 1.3434022665023804, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 5.640060679407475, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.08749563992023468, "logits/rejected": 0.05261549353599548, "logps/chosen": -1.391072392463684, "logps/rejected": -1.680537462234497, "loss": 1.0486, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.391072392463684, "rewards/margins": 0.2894650399684906, "rewards/rejected": -1.680537462234497, "sft_loss": 1.3792742490768433, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 8.121299748032293, "learning_rate": 9.626960114955483e-07, "logits/chosen": -0.031509868800640106, "logits/rejected": 0.10663716495037079, "logps/chosen": -1.4060813188552856, "logps/rejected": -1.670353651046753, "loss": 1.0896, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4060813188552856, "rewards/margins": 0.26427239179611206, "rewards/rejected": -1.670353651046753, "sft_loss": 1.3924411535263062, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 7.942419990123221, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.09847302734851837, "logits/rejected": 0.049504801630973816, "logps/chosen": -1.392059564590454, "logps/rejected": -1.6771005392074585, "loss": 1.0557, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.392059564590454, "rewards/margins": 0.2850412428379059, "rewards/rejected": -1.6771005392074585, "sft_loss": 1.4009144306182861, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 5.441289218712803, "learning_rate": 9.615064944219021e-07, "logits/chosen": -0.04815680533647537, "logits/rejected": 0.0761982649564743, "logps/chosen": -1.2798337936401367, "logps/rejected": -1.6139675378799438, "loss": 1.0236, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2798337936401367, "rewards/margins": 0.3341337740421295, "rewards/rejected": -1.6139675378799438, "sft_loss": 1.3444349765777588, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 11.168127119077242, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.07909546047449112, "logits/rejected": -0.0054620252922177315, "logps/chosen": -1.3398972749710083, "logps/rejected": -1.5796291828155518, "loss": 1.077, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3398972749710083, "rewards/margins": 0.239732027053833, "rewards/rejected": -1.5796291828155518, "sft_loss": 1.3416160345077515, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.264872670173645, "eval_logits/rejected": 0.35892850160598755, "eval_logps/chosen": -1.3685014247894287, "eval_logps/rejected": -1.6704237461090088, "eval_loss": 1.0590972900390625, "eval_rewards/accuracies": 0.5934718251228333, "eval_rewards/chosen": -1.3685014247894287, "eval_rewards/margins": 0.30192235112190247, "eval_rewards/rejected": -1.6704237461090088, "eval_runtime": 43.4948, "eval_samples_per_second": 30.923, "eval_sft_loss": 1.382156491279602, "eval_steps_per_second": 7.748, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 8.162444465412007, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.19529876112937927, "logits/rejected": -0.012327780947089195, "logps/chosen": -1.3344361782073975, "logps/rejected": -1.5679818391799927, "loss": 1.071, "rewards/accuracies": 0.625, "rewards/chosen": -1.3344361782073975, "rewards/margins": 0.23354558646678925, "rewards/rejected": -1.5679818391799927, "sft_loss": 1.3671363592147827, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 8.134798688280451, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.09892721474170685, "logits/rejected": 0.10894634574651718, "logps/chosen": -1.3433425426483154, "logps/rejected": -1.5716124773025513, "loss": 1.0678, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3433425426483154, "rewards/margins": 0.22826996445655823, "rewards/rejected": -1.5716124773025513, "sft_loss": 1.3458704948425293, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 5.685370068762198, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.13253986835479736, "logits/rejected": -0.004379653837531805, "logps/chosen": -1.3532987833023071, "logps/rejected": -1.5336767435073853, "loss": 1.1017, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3532987833023071, "rewards/margins": 0.18037788569927216, "rewards/rejected": -1.5336767435073853, "sft_loss": 1.3750277757644653, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 7.141213912440162, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.045246172696352005, "logits/rejected": 0.16152077913284302, "logps/chosen": -1.2738714218139648, "logps/rejected": -1.4765288829803467, "loss": 1.06, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2738714218139648, "rewards/margins": 0.2026575803756714, "rewards/rejected": -1.4765288829803467, "sft_loss": 1.2613481283187866, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 6.258863579208841, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.18121086061000824, "logits/rejected": 0.03602520748972893, "logps/chosen": -1.3640462160110474, "logps/rejected": -1.5456712245941162, "loss": 1.0971, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3640462160110474, "rewards/margins": 0.18162491917610168, "rewards/rejected": -1.5456712245941162, "sft_loss": 1.3939239978790283, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 9.745810185971298, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.03982186317443848, "logits/rejected": 0.07728839665651321, "logps/chosen": -1.3038945198059082, "logps/rejected": -1.5501224994659424, "loss": 1.0451, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3038945198059082, "rewards/margins": 0.24622802436351776, "rewards/rejected": -1.5501224994659424, "sft_loss": 1.2884595394134521, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 5.964030136663616, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.04003802686929703, "logits/rejected": 0.10245601087808609, "logps/chosen": -1.343867301940918, "logps/rejected": -1.4991952180862427, "loss": 1.1028, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.343867301940918, "rewards/margins": 0.15532787144184113, "rewards/rejected": -1.4991952180862427, "sft_loss": 1.419036626815796, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 6.519019052078934, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.16727600991725922, "logits/rejected": -0.029840881004929543, "logps/chosen": -1.3048359155654907, "logps/rejected": -1.5225237607955933, "loss": 1.066, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3048359155654907, "rewards/margins": 0.21768799424171448, "rewards/rejected": -1.5225237607955933, "sft_loss": 1.3605889081954956, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 6.917448069209358, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.06976354122161865, "logits/rejected": 0.11428710073232651, "logps/chosen": -1.3906006813049316, "logps/rejected": -1.538474678993225, "loss": 1.1394, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3906006813049316, "rewards/margins": 0.14787396788597107, "rewards/rejected": -1.538474678993225, "sft_loss": 1.437873125076294, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 7.9944608337121705, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.04944934695959091, "logits/rejected": 0.16209319233894348, "logps/chosen": -1.4066098928451538, "logps/rejected": -1.5687237977981567, "loss": 1.1164, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4066098928451538, "rewards/margins": 0.1621139943599701, "rewards/rejected": -1.5687237977981567, "sft_loss": 1.4220317602157593, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 7.613567393058519, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.06999413669109344, "logits/rejected": 0.06490659713745117, "logps/chosen": -1.3217823505401611, "logps/rejected": -1.558611273765564, "loss": 1.0764, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3217823505401611, "rewards/margins": 0.23682892322540283, "rewards/rejected": -1.558611273765564, "sft_loss": 1.393017053604126, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 5.503462436845649, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.07714545726776123, "logits/rejected": 0.06864900887012482, "logps/chosen": -1.2933156490325928, "logps/rejected": -1.6474330425262451, "loss": 1.0371, "rewards/accuracies": 0.625, "rewards/chosen": -1.2933156490325928, "rewards/margins": 0.3541174829006195, "rewards/rejected": -1.6474330425262451, "sft_loss": 1.3451446294784546, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 6.962243867292986, "learning_rate": 9.526810643189754e-07, "logits/chosen": -0.0016354650724679232, "logits/rejected": 0.16194012761116028, "logps/chosen": -1.3304073810577393, "logps/rejected": -1.604431390762329, "loss": 1.0525, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3304073810577393, "rewards/margins": 0.2740240693092346, "rewards/rejected": -1.604431390762329, "sft_loss": 1.3525432348251343, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 7.674531775502987, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.09173519909381866, "logits/rejected": 0.021121881902217865, "logps/chosen": -1.3317499160766602, "logps/rejected": -1.6477981805801392, "loss": 1.0598, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3317499160766602, "rewards/margins": 0.31604841351509094, "rewards/rejected": -1.6477981805801392, "sft_loss": 1.3782113790512085, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 6.056954538944757, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.07323311269283295, "logits/rejected": 0.039187707006931305, "logps/chosen": -1.3623502254486084, "logps/rejected": -1.5875145196914673, "loss": 1.0739, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3623502254486084, "rewards/margins": 0.22516405582427979, "rewards/rejected": -1.5875145196914673, "sft_loss": 1.3416621685028076, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 8.226179672171405, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.056537926197052, "logits/rejected": 0.11088557541370392, "logps/chosen": -1.4244935512542725, "logps/rejected": -1.5980981588363647, "loss": 1.1354, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4244935512542725, "rewards/margins": 0.17360465228557587, "rewards/rejected": -1.5980981588363647, "sft_loss": 1.4504040479660034, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 7.010250854431581, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.054350245743989944, "logits/rejected": 0.09495130926370621, "logps/chosen": -1.2873237133026123, "logps/rejected": -1.4944299459457397, "loss": 1.0511, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2873237133026123, "rewards/margins": 0.20710627734661102, "rewards/rejected": -1.4944299459457397, "sft_loss": 1.3140829801559448, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 6.737089823542583, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.19877436757087708, "logits/rejected": -0.042660392820835114, "logps/chosen": -1.3604657649993896, "logps/rejected": -1.480218529701233, "loss": 1.1203, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3604657649993896, "rewards/margins": 0.11975283920764923, "rewards/rejected": -1.480218529701233, "sft_loss": 1.3572709560394287, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 5.140746161365096, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.08950953930616379, "logits/rejected": -0.07718075811862946, "logps/chosen": -1.3389971256256104, "logps/rejected": -1.635023832321167, "loss": 1.0446, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3389971256256104, "rewards/margins": 0.2960268557071686, "rewards/rejected": -1.635023832321167, "sft_loss": 1.3482633829116821, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 6.4103278006865585, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.09214359521865845, "logits/rejected": 0.17356061935424805, "logps/chosen": -1.3123610019683838, "logps/rejected": -1.6263983249664307, "loss": 1.0509, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3123610019683838, "rewards/margins": 0.31403714418411255, "rewards/rejected": -1.6263983249664307, "sft_loss": 1.3706796169281006, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 6.504675097223832, "learning_rate": 9.472503898067645e-07, "logits/chosen": 0.002763524651527405, "logits/rejected": 0.05632271245121956, "logps/chosen": -1.3333485126495361, "logps/rejected": -1.5826590061187744, "loss": 1.0595, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3333485126495361, "rewards/margins": 0.24931053817272186, "rewards/rejected": -1.5826590061187744, "sft_loss": 1.338838815689087, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 4.090952103579744, "learning_rate": 9.465519589364099e-07, "logits/chosen": -0.008335872553288937, "logits/rejected": 0.07565923780202866, "logps/chosen": -1.3219220638275146, "logps/rejected": -1.5549265146255493, "loss": 1.071, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3219220638275146, "rewards/margins": 0.2330043613910675, "rewards/rejected": -1.5549265146255493, "sft_loss": 1.3386104106903076, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 6.926733832435039, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.11416655778884888, "logits/rejected": 0.07172179222106934, "logps/chosen": -1.2963732481002808, "logps/rejected": -1.5955102443695068, "loss": 1.0273, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2963732481002808, "rewards/margins": 0.2991369962692261, "rewards/rejected": -1.5955102443695068, "sft_loss": 1.2894750833511353, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 4.802406989227033, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.1949145644903183, "logits/rejected": 0.018199989572167397, "logps/chosen": -1.267913579940796, "logps/rejected": -1.5648223161697388, "loss": 1.0175, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.267913579940796, "rewards/margins": 0.29690876603126526, "rewards/rejected": -1.5648223161697388, "sft_loss": 1.2981455326080322, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 5.30569400241199, "learning_rate": 9.444306989457805e-07, "logits/chosen": -0.03965713828802109, "logits/rejected": 0.07506690919399261, "logps/chosen": -1.3903119564056396, "logps/rejected": -1.5926454067230225, "loss": 1.1198, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3903119564056396, "rewards/margins": 0.2023334801197052, "rewards/rejected": -1.5926454067230225, "sft_loss": 1.3613229990005493, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 6.066655619066335, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.06363092362880707, "logits/rejected": 0.05990830063819885, "logps/chosen": -1.3233710527420044, "logps/rejected": -1.4781570434570312, "loss": 1.0817, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3233710527420044, "rewards/margins": 0.15478602051734924, "rewards/rejected": -1.4781570434570312, "sft_loss": 1.3233697414398193, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 5.762243489235919, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.028935562819242477, "logits/rejected": 0.05837502330541611, "logps/chosen": -1.395919919013977, "logps/rejected": -1.6004736423492432, "loss": 1.0824, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.395919919013977, "rewards/margins": 0.2045535296201706, "rewards/rejected": -1.6004736423492432, "sft_loss": 1.4127261638641357, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 8.214963066893405, "learning_rate": 9.422706323888396e-07, "logits/chosen": -0.03690364211797714, "logits/rejected": -0.0002823077084030956, "logps/chosen": -1.346161127090454, "logps/rejected": -1.5365341901779175, "loss": 1.0805, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.346161127090454, "rewards/margins": 0.19037306308746338, "rewards/rejected": -1.5365341901779175, "sft_loss": 1.3572635650634766, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 4.476913094970793, "learning_rate": 9.415420190240225e-07, "logits/chosen": 0.01798745058476925, "logits/rejected": 0.21475832164287567, "logps/chosen": -1.3314244747161865, "logps/rejected": -1.5622254610061646, "loss": 1.0663, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3314244747161865, "rewards/margins": 0.2308010309934616, "rewards/rejected": -1.5622254610061646, "sft_loss": 1.367760419845581, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 7.3383665115717776, "learning_rate": 9.408091218166002e-07, "logits/chosen": 0.0036095590330660343, "logits/rejected": 0.06418739259243011, "logps/chosen": -1.3319792747497559, "logps/rejected": -1.4216053485870361, "loss": 1.1099, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3319792747497559, "rewards/margins": 0.0896257609128952, "rewards/rejected": -1.4216053485870361, "sft_loss": 1.3568004369735718, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 5.1576036315994385, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.034273095428943634, "logits/rejected": 0.2666153609752655, "logps/chosen": -1.3899375200271606, "logps/rejected": -1.5520015954971313, "loss": 1.1164, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3899375200271606, "rewards/margins": 0.16206394135951996, "rewards/rejected": -1.5520015954971313, "sft_loss": 1.4064255952835083, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 6.155451220270838, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.141189306974411, "logits/rejected": 0.014295866712927818, "logps/chosen": -1.4161455631256104, "logps/rejected": -1.6726045608520508, "loss": 1.0969, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4161455631256104, "rewards/margins": 0.2564590871334076, "rewards/rejected": -1.6726045608520508, "sft_loss": 1.4574393033981323, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 5.507852253151474, "learning_rate": 9.38584798451817e-07, "logits/chosen": -0.013143645599484444, "logits/rejected": 0.13177946209907532, "logps/chosen": -1.343443512916565, "logps/rejected": -1.5579149723052979, "loss": 1.0692, "rewards/accuracies": 0.5625, "rewards/chosen": -1.343443512916565, "rewards/margins": 0.2144714891910553, "rewards/rejected": -1.5579149723052979, "sft_loss": 1.3730356693267822, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 16.701879236246025, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.0122370021417737, "logits/rejected": 0.11792914569377899, "logps/chosen": -1.3964431285858154, "logps/rejected": -1.7372715473175049, "loss": 1.0529, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3964431285858154, "rewards/margins": 0.3408281207084656, "rewards/rejected": -1.7372715473175049, "sft_loss": 1.4153270721435547, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 4.722020047322209, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.07226811349391937, "logits/rejected": 0.08238498121500015, "logps/chosen": -1.3164997100830078, "logps/rejected": -1.6869707107543945, "loss": 1.0365, "rewards/accuracies": 0.625, "rewards/chosen": -1.3164997100830078, "rewards/margins": 0.37047088146209717, "rewards/rejected": -1.6869707107543945, "sft_loss": 1.3838088512420654, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 5.955961811337593, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.11595847457647324, "logits/rejected": 0.016401495784521103, "logps/chosen": -1.367996096611023, "logps/rejected": -1.5372674465179443, "loss": 1.1093, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.367996096611023, "rewards/margins": 0.16927149891853333, "rewards/rejected": -1.5372674465179443, "sft_loss": 1.3482203483581543, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 11.099017032697603, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.08770633488893509, "logits/rejected": 0.05239544063806534, "logps/chosen": -1.2994236946105957, "logps/rejected": -1.551729440689087, "loss": 1.0551, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2994236946105957, "rewards/margins": 0.25230568647384644, "rewards/rejected": -1.551729440689087, "sft_loss": 1.331479549407959, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 9.492347311893651, "learning_rate": 9.34792587788002e-07, "logits/chosen": 0.004612951073795557, "logits/rejected": 0.12525108456611633, "logps/chosen": -1.3718888759613037, "logps/rejected": -1.640920877456665, "loss": 1.0616, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3718888759613037, "rewards/margins": 0.2690318822860718, "rewards/rejected": -1.640920877456665, "sft_loss": 1.4074128866195679, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 5.333530619271008, "learning_rate": 9.34021460937342e-07, "logits/chosen": 0.009407530538737774, "logits/rejected": 0.10629584640264511, "logps/chosen": -1.3189841508865356, "logps/rejected": -1.4889742136001587, "loss": 1.0625, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3189841508865356, "rewards/margins": 0.1699899584054947, "rewards/rejected": -1.4889742136001587, "sft_loss": 1.3322023153305054, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 6.1020804158599855, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.17140880227088928, "logits/rejected": -0.028482386842370033, "logps/chosen": -1.4129021167755127, "logps/rejected": -1.6202716827392578, "loss": 1.1006, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4129021167755127, "rewards/margins": 0.2073693722486496, "rewards/rejected": -1.6202716827392578, "sft_loss": 1.4178996086120605, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 4.848542855963102, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.056819796562194824, "logits/rejected": 0.1313043087720871, "logps/chosen": -1.2349143028259277, "logps/rejected": -1.6562589406967163, "loss": 1.0076, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2349143028259277, "rewards/margins": 0.4213446080684662, "rewards/rejected": -1.6562589406967163, "sft_loss": 1.2959566116333008, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 7.14445775993856, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.08984025567770004, "logits/rejected": 0.10239746421575546, "logps/chosen": -1.3794463872909546, "logps/rejected": -1.649232268333435, "loss": 1.0763, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3794463872909546, "rewards/margins": 0.26978588104248047, "rewards/rejected": -1.649232268333435, "sft_loss": 1.4267511367797852, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 8.302373732710077, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.01051993016153574, "logits/rejected": 0.13935205340385437, "logps/chosen": -1.4041858911514282, "logps/rejected": -1.4739868640899658, "loss": 1.1334, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4041858911514282, "rewards/margins": 0.06980089098215103, "rewards/rejected": -1.4739868640899658, "sft_loss": 1.379386067390442, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 6.116793410992373, "learning_rate": 9.301028145701543e-07, "logits/chosen": 0.02948124334216118, "logits/rejected": 0.17535671591758728, "logps/chosen": -1.2956939935684204, "logps/rejected": -1.7455459833145142, "loss": 1.0261, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2956939935684204, "rewards/margins": 0.4498518407344818, "rewards/rejected": -1.7455459833145142, "sft_loss": 1.3457810878753662, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 5.8029302256255395, "learning_rate": 9.293065361002563e-07, "logits/chosen": 0.05574868991971016, "logits/rejected": 0.1520715355873108, "logps/chosen": -1.3259742259979248, "logps/rejected": -1.7329938411712646, "loss": 1.0178, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3259742259979248, "rewards/margins": 0.4070195257663727, "rewards/rejected": -1.7329938411712646, "sft_loss": 1.3466819524765015, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 10.907972977051783, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.061895061284303665, "logits/rejected": 0.0811760351061821, "logps/chosen": -1.3956642150878906, "logps/rejected": -1.5960217714309692, "loss": 1.0789, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3956642150878906, "rewards/margins": 0.2003575563430786, "rewards/rejected": -1.5960217714309692, "sft_loss": 1.374837875366211, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 4.703252203250831, "learning_rate": 9.277014915246792e-07, "logits/chosen": 0.0737505629658699, "logits/rejected": 0.13021281361579895, "logps/chosen": -1.2985069751739502, "logps/rejected": -1.6690765619277954, "loss": 1.0218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2985069751739502, "rewards/margins": 0.3705694377422333, "rewards/rejected": -1.6690765619277954, "sft_loss": 1.3173308372497559, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 4.46831297658942, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.04367053136229515, "logits/rejected": 0.06593044847249985, "logps/chosen": -1.3501296043395996, "logps/rejected": -1.5123838186264038, "loss": 1.1018, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3501296043395996, "rewards/margins": 0.1622542440891266, "rewards/rejected": -1.5123838186264038, "sft_loss": 1.399683952331543, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 7.067902265570916, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.11400493234395981, "logits/rejected": 0.10712490230798721, "logps/chosen": -1.3892908096313477, "logps/rejected": -1.571804165840149, "loss": 1.1281, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3892908096313477, "rewards/margins": 0.1825132519006729, "rewards/rejected": -1.571804165840149, "sft_loss": 1.450918197631836, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 8.924429900342966, "learning_rate": 9.252628226650389e-07, "logits/chosen": 0.027986442670226097, "logits/rejected": 0.12609614431858063, "logps/chosen": -1.3055108785629272, "logps/rejected": -1.4768112897872925, "loss": 1.081, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3055108785629272, "rewards/margins": 0.1713004857301712, "rewards/rejected": -1.4768112897872925, "sft_loss": 1.3337790966033936, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 9.113430126638274, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.0451820082962513, "logits/rejected": 0.1254032701253891, "logps/chosen": -1.3234608173370361, "logps/rejected": -1.6025323867797852, "loss": 1.0535, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3234608173370361, "rewards/margins": 0.27907148003578186, "rewards/rejected": -1.6025323867797852, "sft_loss": 1.3641140460968018, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 5.923575127090414, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.027543995529413223, "logits/rejected": 0.14627543091773987, "logps/chosen": -1.2854974269866943, "logps/rejected": -1.573866844177246, "loss": 1.0222, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2854974269866943, "rewards/margins": 0.2883693277835846, "rewards/rejected": -1.573866844177246, "sft_loss": 1.2862838506698608, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 5.1028035186060094, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.012474549934267998, "logits/rejected": 0.09835416078567505, "logps/chosen": -1.3849401473999023, "logps/rejected": -1.6243913173675537, "loss": 1.0999, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3849401473999023, "rewards/margins": 0.23945105075836182, "rewards/rejected": -1.6243913173675537, "sft_loss": 1.4353958368301392, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 6.4051747902711265, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.07133413851261139, "logits/rejected": -0.03863609582185745, "logps/chosen": -1.3444093465805054, "logps/rejected": -1.5882580280303955, "loss": 1.0804, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3444093465805054, "rewards/margins": 0.2438487708568573, "rewards/rejected": -1.5882580280303955, "sft_loss": 1.387082815170288, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 6.734701551616222, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.11850512027740479, "logits/rejected": 0.05692233517765999, "logps/chosen": -1.3440381288528442, "logps/rejected": -1.6370413303375244, "loss": 1.056, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3440381288528442, "rewards/margins": 0.29300326108932495, "rewards/rejected": -1.6370413303375244, "sft_loss": 1.3866641521453857, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 5.882501221972273, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.021683160215616226, "logits/rejected": 0.09188304096460342, "logps/chosen": -1.312713861465454, "logps/rejected": -1.6861861944198608, "loss": 1.0403, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.312713861465454, "rewards/margins": 0.37347230315208435, "rewards/rejected": -1.6861861944198608, "sft_loss": 1.3612488508224487, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 6.189785230569557, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.003997665364295244, "logits/rejected": 0.16212308406829834, "logps/chosen": -1.393507957458496, "logps/rejected": -1.743019461631775, "loss": 1.0658, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.393507957458496, "rewards/margins": 0.34951135516166687, "rewards/rejected": -1.743019461631775, "sft_loss": 1.3766828775405884, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 8.010598484403992, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.05653812736272812, "logits/rejected": 0.06350454688072205, "logps/chosen": -1.360586166381836, "logps/rejected": -1.5594004392623901, "loss": 1.0994, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.360586166381836, "rewards/margins": 0.1988140493631363, "rewards/rejected": -1.5594004392623901, "sft_loss": 1.3619134426116943, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 7.260851698872519, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.044975005090236664, "logits/rejected": 0.06740692257881165, "logps/chosen": -1.508986473083496, "logps/rejected": -1.6397491693496704, "loss": 1.1746, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.508986473083496, "rewards/margins": 0.1307627260684967, "rewards/rejected": -1.6397491693496704, "sft_loss": 1.5416109561920166, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 16.709625608251123, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.0836515948176384, "logits/rejected": 0.021665522828698158, "logps/chosen": -1.392195463180542, "logps/rejected": -1.7002432346343994, "loss": 1.082, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.392195463180542, "rewards/margins": 0.30804774165153503, "rewards/rejected": -1.7002432346343994, "sft_loss": 1.4488755464553833, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 10.151517285966094, "learning_rate": 9.16004998712373e-07, "logits/chosen": 0.017516251653432846, "logits/rejected": 0.08750508725643158, "logps/chosen": -1.324232816696167, "logps/rejected": -1.7349544763565063, "loss": 0.9967, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.324232816696167, "rewards/margins": 0.4107215404510498, "rewards/rejected": -1.7349544763565063, "sft_loss": 1.313400149345398, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 5.47793827688658, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.08696796745061874, "logits/rejected": 0.20715276896953583, "logps/chosen": -1.4028065204620361, "logps/rejected": -1.7313998937606812, "loss": 1.0586, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4028065204620361, "rewards/margins": 0.32859352231025696, "rewards/rejected": -1.7313998937606812, "sft_loss": 1.4195729494094849, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 5.933438633720938, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.024196814745664597, "logits/rejected": 0.04886165261268616, "logps/chosen": -1.329463243484497, "logps/rejected": -1.601779580116272, "loss": 1.0526, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.329463243484497, "rewards/margins": 0.2723161578178406, "rewards/rejected": -1.601779580116272, "sft_loss": 1.3440353870391846, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 13.207225278484374, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.026406964287161827, "logits/rejected": 0.06156226992607117, "logps/chosen": -1.371176838874817, "logps/rejected": -1.6098140478134155, "loss": 1.0931, "rewards/accuracies": 0.5625, "rewards/chosen": -1.371176838874817, "rewards/margins": 0.23863713443279266, "rewards/rejected": -1.6098140478134155, "sft_loss": 1.4641458988189697, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 5.810313113640756, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.058356620371341705, "logits/rejected": 0.13299037516117096, "logps/chosen": -1.3551055192947388, "logps/rejected": -1.5649089813232422, "loss": 1.0987, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3551055192947388, "rewards/margins": 0.20980365574359894, "rewards/rejected": -1.5649089813232422, "sft_loss": 1.3459253311157227, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 8.495492846008881, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.07601507008075714, "logits/rejected": 0.03884817287325859, "logps/chosen": -1.3821711540222168, "logps/rejected": -1.6108427047729492, "loss": 1.0717, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3821711540222168, "rewards/margins": 0.22867155075073242, "rewards/rejected": -1.6108427047729492, "sft_loss": 1.3557847738265991, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 9.245630706009585, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.09205770492553711, "logits/rejected": 0.08740700036287308, "logps/chosen": -1.3065885305404663, "logps/rejected": -1.5755001306533813, "loss": 1.037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3065885305404663, "rewards/margins": 0.2689115107059479, "rewards/rejected": -1.5755001306533813, "sft_loss": 1.3555333614349365, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 9.24164649587981, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.037065595388412476, "logits/rejected": 0.12826170027256012, "logps/chosen": -1.3346545696258545, "logps/rejected": -1.7283185720443726, "loss": 1.0112, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3346545696258545, "rewards/margins": 0.3936638832092285, "rewards/rejected": -1.7283185720443726, "sft_loss": 1.3487634658813477, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 5.61559058889292, "learning_rate": 9.089646803833588e-07, "logits/chosen": 0.0015423030126839876, "logits/rejected": 0.17759868502616882, "logps/chosen": -1.3313993215560913, "logps/rejected": -1.568703293800354, "loss": 1.082, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3313993215560913, "rewards/margins": 0.23730382323265076, "rewards/rejected": -1.568703293800354, "sft_loss": 1.3770296573638916, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 7.34705254818011, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.17329472303390503, "logits/rejected": 0.10016246140003204, "logps/chosen": -1.3927847146987915, "logps/rejected": -1.686934232711792, "loss": 1.0866, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3927847146987915, "rewards/margins": 0.2941496670246124, "rewards/rejected": -1.686934232711792, "sft_loss": 1.4320820569992065, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 4.983173298606035, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.1340692788362503, "logits/rejected": 0.1313193291425705, "logps/chosen": -1.4077918529510498, "logps/rejected": -1.7832787036895752, "loss": 1.0661, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4077918529510498, "rewards/margins": 0.3754867911338806, "rewards/rejected": -1.7832787036895752, "sft_loss": 1.374353289604187, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 7.815769239132045, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.014563229866325855, "logits/rejected": 0.053143925964832306, "logps/chosen": -1.4143667221069336, "logps/rejected": -1.6859022378921509, "loss": 1.0941, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4143667221069336, "rewards/margins": 0.27153539657592773, "rewards/rejected": -1.6859022378921509, "sft_loss": 1.4450477361679077, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 5.775345808606975, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.015811914578080177, "logits/rejected": 0.08232811838388443, "logps/chosen": -1.4326014518737793, "logps/rejected": -1.661134958267212, "loss": 1.1093, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4326014518737793, "rewards/margins": 0.2285335510969162, "rewards/rejected": -1.661134958267212, "sft_loss": 1.456074595451355, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 10.144975569245078, "learning_rate": 9.044352511642661e-07, "logits/chosen": 0.018102655187249184, "logits/rejected": 0.03663431853055954, "logps/chosen": -1.3111892938613892, "logps/rejected": -1.5250718593597412, "loss": 1.0807, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3111892938613892, "rewards/margins": 0.21388253569602966, "rewards/rejected": -1.5250718593597412, "sft_loss": 1.3417662382125854, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 5.937690748680565, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.030872244387865067, "logits/rejected": 0.08035401999950409, "logps/chosen": -1.3295609951019287, "logps/rejected": -1.500370740890503, "loss": 1.0938, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3295609951019287, "rewards/margins": 0.1708097904920578, "rewards/rejected": -1.500370740890503, "sft_loss": 1.3526275157928467, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 7.6310828764205985, "learning_rate": 9.025959508580436e-07, "logits/chosen": 0.02246050536632538, "logits/rejected": 0.27992087602615356, "logps/chosen": -1.341638207435608, "logps/rejected": -1.6132581233978271, "loss": 1.0564, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.341638207435608, "rewards/margins": 0.2716197371482849, "rewards/rejected": -1.6132581233978271, "sft_loss": 1.3668334484100342, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 4.038509966910056, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.022608067840337753, "logits/rejected": 0.14372625946998596, "logps/chosen": -1.2611603736877441, "logps/rejected": -1.6419155597686768, "loss": 0.99, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2611603736877441, "rewards/margins": 0.38075512647628784, "rewards/rejected": -1.6419155597686768, "sft_loss": 1.2734200954437256, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 6.569454422734971, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.08741948008537292, "logits/rejected": 0.009515544399619102, "logps/chosen": -1.2867238521575928, "logps/rejected": -1.4618428945541382, "loss": 1.0808, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2867238521575928, "rewards/margins": 0.1751190721988678, "rewards/rejected": -1.4618428945541382, "sft_loss": 1.3573739528656006, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 9.212700327011175, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.0024367093574255705, "logits/rejected": 0.08118894696235657, "logps/chosen": -1.4251039028167725, "logps/rejected": -1.5358526706695557, "loss": 1.1365, "rewards/accuracies": 0.5, "rewards/chosen": -1.4251039028167725, "rewards/margins": 0.11074896156787872, "rewards/rejected": -1.5358526706695557, "sft_loss": 1.4242948293685913, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 4.203891282882451, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.0969192385673523, "logits/rejected": 0.11570189148187637, "logps/chosen": -1.3715673685073853, "logps/rejected": -1.714408278465271, "loss": 1.0489, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3715673685073853, "rewards/margins": 0.3428409993648529, "rewards/rejected": -1.714408278465271, "sft_loss": 1.3946869373321533, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.329307496547699, "eval_logits/rejected": 0.429740846157074, "eval_logps/chosen": -1.351759910583496, "eval_logps/rejected": -1.6476715803146362, "eval_loss": 1.0554943084716797, "eval_rewards/accuracies": 0.5905044674873352, "eval_rewards/chosen": -1.351759910583496, "eval_rewards/margins": 0.295911580324173, "eval_rewards/rejected": -1.6476715803146362, "eval_runtime": 43.4771, "eval_samples_per_second": 30.936, "eval_sft_loss": 1.3766752481460571, "eval_steps_per_second": 7.751, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 5.530882088518943, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.12066706269979477, "logits/rejected": 0.10889676958322525, "logps/chosen": -1.3122395277023315, "logps/rejected": -1.6446495056152344, "loss": 1.0359, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3122395277023315, "rewards/margins": 0.33240991830825806, "rewards/rejected": -1.6446495056152344, "sft_loss": 1.3601765632629395, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 5.129344765895249, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.15109679102897644, "logits/rejected": 0.033114198595285416, "logps/chosen": -1.2982641458511353, "logps/rejected": -1.697427749633789, "loss": 1.0177, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2982641458511353, "rewards/margins": 0.3991636335849762, "rewards/rejected": -1.697427749633789, "sft_loss": 1.3306095600128174, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 6.609932526409894, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.0883597657084465, "logits/rejected": 0.02439655363559723, "logps/chosen": -1.4022852182388306, "logps/rejected": -1.7770626544952393, "loss": 1.0731, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4022852182388306, "rewards/margins": 0.3747774064540863, "rewards/rejected": -1.7770626544952393, "sft_loss": 1.4265952110290527, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 6.147812997346706, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.04175793007016182, "logits/rejected": 0.10314915329217911, "logps/chosen": -1.27086341381073, "logps/rejected": -1.6761993169784546, "loss": 1.0046, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.27086341381073, "rewards/margins": 0.4053359627723694, "rewards/rejected": -1.6761993169784546, "sft_loss": 1.283146619796753, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 7.6358092026569375, "learning_rate": 8.941267982915213e-07, "logits/chosen": 0.01520055253058672, "logits/rejected": 0.06274382770061493, "logps/chosen": -1.4354435205459595, "logps/rejected": -1.6466639041900635, "loss": 1.099, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4354435205459595, "rewards/margins": 0.21122050285339355, "rewards/rejected": -1.6466639041900635, "sft_loss": 1.3758140802383423, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 7.221964111745881, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.07788058370351791, "logits/rejected": 0.07453887164592743, "logps/chosen": -1.3857173919677734, "logps/rejected": -1.6590982675552368, "loss": 1.0866, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3857173919677734, "rewards/margins": 0.27338093519210815, "rewards/rejected": -1.6590982675552368, "sft_loss": 1.3786849975585938, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 6.640721192897655, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.14042620360851288, "logits/rejected": -0.017853520810604095, "logps/chosen": -1.2752376794815063, "logps/rejected": -1.6433387994766235, "loss": 1.0192, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2752376794815063, "rewards/margins": 0.3681010603904724, "rewards/rejected": -1.6433387994766235, "sft_loss": 1.320291519165039, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 9.724216699659413, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.1258830577135086, "logits/rejected": -0.0012362360721454024, "logps/chosen": -1.2984638214111328, "logps/rejected": -1.6937593221664429, "loss": 1.029, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2984638214111328, "rewards/margins": 0.39529532194137573, "rewards/rejected": -1.6937593221664429, "sft_loss": 1.322912335395813, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 7.377517402484911, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.07669184356927872, "logits/rejected": -0.016088414937257767, "logps/chosen": -1.2998298406600952, "logps/rejected": -1.537240982055664, "loss": 1.0434, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2998298406600952, "rewards/margins": 0.2374109923839569, "rewards/rejected": -1.537240982055664, "sft_loss": 1.3199104070663452, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 8.256103406403513, "learning_rate": 8.892874524469537e-07, "logits/chosen": 0.020514000207185745, "logits/rejected": 0.09050575643777847, "logps/chosen": -1.3168718814849854, "logps/rejected": -1.610409140586853, "loss": 1.0248, "rewards/accuracies": 0.625, "rewards/chosen": -1.3168718814849854, "rewards/margins": 0.2935372591018677, "rewards/rejected": -1.610409140586853, "sft_loss": 1.3246101140975952, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 6.034861988633968, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.08695786446332932, "logits/rejected": 0.02888108417391777, "logps/chosen": -1.3646390438079834, "logps/rejected": -1.6218284368515015, "loss": 1.0699, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3646390438079834, "rewards/margins": 0.2571892738342285, "rewards/rejected": -1.6218284368515015, "sft_loss": 1.3555657863616943, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 5.876243524685892, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.023615438491106033, "logits/rejected": -0.002601534128189087, "logps/chosen": -1.2742602825164795, "logps/rejected": -1.5528959035873413, "loss": 1.0223, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2742602825164795, "rewards/margins": 0.2786356806755066, "rewards/rejected": -1.5528959035873413, "sft_loss": 1.2252178192138672, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 4.336071573444084, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.02723364531993866, "logits/rejected": 0.04549012333154678, "logps/chosen": -1.3814964294433594, "logps/rejected": -1.5734784603118896, "loss": 1.1007, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3814964294433594, "rewards/margins": 0.19198183715343475, "rewards/rejected": -1.5734784603118896, "sft_loss": 1.3701436519622803, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 5.367175487319009, "learning_rate": 8.853479363438342e-07, "logits/chosen": 0.01956319436430931, "logits/rejected": 0.20048554241657257, "logps/chosen": -1.4105165004730225, "logps/rejected": -1.5900005102157593, "loss": 1.1235, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4105165004730225, "rewards/margins": 0.17948417365550995, "rewards/rejected": -1.5900005102157593, "sft_loss": 1.3565194606781006, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 5.722057114528683, "learning_rate": 8.843536867202588e-07, "logits/chosen": 0.008368739858269691, "logits/rejected": 0.22673054039478302, "logps/chosen": -1.3704248666763306, "logps/rejected": -1.751349687576294, "loss": 1.0563, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3704248666763306, "rewards/margins": 0.38092485070228577, "rewards/rejected": -1.751349687576294, "sft_loss": 1.4024537801742554, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 7.455299392555562, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.07408158481121063, "logits/rejected": 0.05479319021105766, "logps/chosen": -1.388621211051941, "logps/rejected": -1.6247622966766357, "loss": 1.0734, "rewards/accuracies": 0.59375, "rewards/chosen": -1.388621211051941, "rewards/margins": 0.23614096641540527, "rewards/rejected": -1.6247622966766357, "sft_loss": 1.3891102075576782, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 8.678480463656214, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.06411705911159515, "logits/rejected": 0.18151408433914185, "logps/chosen": -1.3492683172225952, "logps/rejected": -1.6280282735824585, "loss": 1.0863, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3492683172225952, "rewards/margins": 0.27876001596450806, "rewards/rejected": -1.6280282735824585, "sft_loss": 1.3718981742858887, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 5.53415486038033, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.0433022603392601, "logits/rejected": 0.164881631731987, "logps/chosen": -1.2780497074127197, "logps/rejected": -1.584289789199829, "loss": 1.0377, "rewards/accuracies": 0.625, "rewards/chosen": -1.2780497074127197, "rewards/margins": 0.30624014139175415, "rewards/rejected": -1.584289789199829, "sft_loss": 1.3719570636749268, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 11.890457950281, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.1934044361114502, "logits/rejected": -0.035475775599479675, "logps/chosen": -1.385246992111206, "logps/rejected": -1.6005245447158813, "loss": 1.1158, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.385246992111206, "rewards/margins": 0.21527759730815887, "rewards/rejected": -1.6005245447158813, "sft_loss": 1.4564180374145508, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 7.191009316057824, "learning_rate": 8.793266977736342e-07, "logits/chosen": 0.0019111812580376863, "logits/rejected": -0.04607079178094864, "logps/chosen": -1.3749603033065796, "logps/rejected": -1.485594630241394, "loss": 1.1236, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3749603033065796, "rewards/margins": 0.11063437163829803, "rewards/rejected": -1.485594630241394, "sft_loss": 1.397199273109436, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 8.97809173868377, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.006315511651337147, "logits/rejected": 0.14795732498168945, "logps/chosen": -1.3420408964157104, "logps/rejected": -1.511926293373108, "loss": 1.0932, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3420408964157104, "rewards/margins": 0.16988542675971985, "rewards/rejected": -1.511926293373108, "sft_loss": 1.3750202655792236, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 5.8200911103522115, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.030334660783410072, "logits/rejected": 0.06693456321954727, "logps/chosen": -1.3011609315872192, "logps/rejected": -1.5238155126571655, "loss": 1.0611, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3011609315872192, "rewards/margins": 0.22265465557575226, "rewards/rejected": -1.5238155126571655, "sft_loss": 1.3497133255004883, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 9.054622362148715, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.02639349177479744, "logits/rejected": 0.19740860164165497, "logps/chosen": -1.40165114402771, "logps/rejected": -1.5750657320022583, "loss": 1.1327, "rewards/accuracies": 0.53125, "rewards/chosen": -1.40165114402771, "rewards/margins": 0.17341431975364685, "rewards/rejected": -1.5750657320022583, "sft_loss": 1.38827383518219, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 12.320116491659183, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.03263465687632561, "logits/rejected": 0.03770837560296059, "logps/chosen": -1.2219219207763672, "logps/rejected": -1.6054325103759766, "loss": 1.0056, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2219219207763672, "rewards/margins": 0.3835105299949646, "rewards/rejected": -1.6054325103759766, "sft_loss": 1.2506580352783203, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 5.4753550125906445, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.03400716930627823, "logits/rejected": 0.08444056659936905, "logps/chosen": -1.3580182790756226, "logps/rejected": -1.7344977855682373, "loss": 1.0558, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3580182790756226, "rewards/margins": 0.3764795660972595, "rewards/rejected": -1.7344977855682373, "sft_loss": 1.4085122346878052, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 6.076706383744978, "learning_rate": 8.731729746982068e-07, "logits/chosen": 0.06157577782869339, "logits/rejected": 0.11648639291524887, "logps/chosen": -1.3367359638214111, "logps/rejected": -1.5042455196380615, "loss": 1.0917, "rewards/accuracies": 0.5, "rewards/chosen": -1.3367359638214111, "rewards/margins": 0.1675097644329071, "rewards/rejected": -1.5042455196380615, "sft_loss": 1.34554922580719, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 6.843318001545579, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.1242741122841835, "logits/rejected": 0.05887087434530258, "logps/chosen": -1.2805627584457397, "logps/rejected": -1.8210529088974, "loss": 0.9578, "rewards/accuracies": 0.625, "rewards/chosen": -1.2805627584457397, "rewards/margins": 0.5404902696609497, "rewards/rejected": -1.8210529088974, "sft_loss": 1.2435736656188965, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 7.6052098654695035, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.05907027795910835, "logits/rejected": 0.09855295717716217, "logps/chosen": -1.3115413188934326, "logps/rejected": -1.587019920349121, "loss": 1.0278, "rewards/accuracies": 0.625, "rewards/chosen": -1.3115413188934326, "rewards/margins": 0.2754787802696228, "rewards/rejected": -1.587019920349121, "sft_loss": 1.2654893398284912, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 4.790584119290133, "learning_rate": 8.700471013287424e-07, "logits/chosen": 0.022471506148576736, "logits/rejected": 0.05765972658991814, "logps/chosen": -1.3353490829467773, "logps/rejected": -1.5706102848052979, "loss": 1.0756, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3353490829467773, "rewards/margins": 0.23526112735271454, "rewards/rejected": -1.5706102848052979, "sft_loss": 1.3784520626068115, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 13.045781580257584, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.02450602874159813, "logits/rejected": 0.044948406517505646, "logps/chosen": -1.360719084739685, "logps/rejected": -1.6785328388214111, "loss": 1.0467, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.360719084739685, "rewards/margins": 0.31781378388404846, "rewards/rejected": -1.6785328388214111, "sft_loss": 1.3535664081573486, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 5.408447209248505, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.031846821308135986, "logits/rejected": 0.07990224659442902, "logps/chosen": -1.2945154905319214, "logps/rejected": -1.5294058322906494, "loss": 1.0546, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2945154905319214, "rewards/margins": 0.23489025235176086, "rewards/rejected": -1.5294058322906494, "sft_loss": 1.3085170984268188, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 6.636071157387697, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.030917812138795853, "logits/rejected": 0.10335756838321686, "logps/chosen": -1.2730026245117188, "logps/rejected": -1.5577722787857056, "loss": 1.0495, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2730026245117188, "rewards/margins": 0.2847694754600525, "rewards/rejected": -1.5577722787857056, "sft_loss": 1.360364556312561, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 7.336694066056145, "learning_rate": 8.658290552963827e-07, "logits/chosen": 0.005440413951873779, "logits/rejected": 0.030295390635728836, "logps/chosen": -1.3073769807815552, "logps/rejected": -1.6339219808578491, "loss": 1.0344, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3073769807815552, "rewards/margins": 0.3265449106693268, "rewards/rejected": -1.6339219808578491, "sft_loss": 1.3142335414886475, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 5.013746797596467, "learning_rate": 8.647656449367966e-07, "logits/chosen": -0.0021407068707048893, "logits/rejected": 0.15958009660243988, "logps/chosen": -1.3515747785568237, "logps/rejected": -1.513629674911499, "loss": 1.0904, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3515747785568237, "rewards/margins": 0.1620550900697708, "rewards/rejected": -1.513629674911499, "sft_loss": 1.400733232498169, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 7.874936122712779, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.059729017317295074, "logits/rejected": 0.04206446558237076, "logps/chosen": -1.2914938926696777, "logps/rejected": -1.564706563949585, "loss": 1.0404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2914938926696777, "rewards/margins": 0.2732127010822296, "rewards/rejected": -1.564706563949585, "sft_loss": 1.3349140882492065, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 12.882113345243505, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.06150873750448227, "logits/rejected": 0.07276951521635056, "logps/chosen": -1.3248833417892456, "logps/rejected": -1.6360422372817993, "loss": 1.0316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3248833417892456, "rewards/margins": 0.3111588656902313, "rewards/rejected": -1.6360422372817993, "sft_loss": 1.2923238277435303, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 8.01846677038553, "learning_rate": 8.615542215511389e-07, "logits/chosen": 0.030636975541710854, "logits/rejected": 0.10803890228271484, "logps/chosen": -1.280662178993225, "logps/rejected": -1.4428179264068604, "loss": 1.057, "rewards/accuracies": 0.5625, "rewards/chosen": -1.280662178993225, "rewards/margins": 0.1621556282043457, "rewards/rejected": -1.4428179264068604, "sft_loss": 1.281178593635559, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 6.808143996882543, "learning_rate": 8.604767176061241e-07, "logits/chosen": 0.0560939684510231, "logits/rejected": 0.10724347829818726, "logps/chosen": -1.343372106552124, "logps/rejected": -1.5355207920074463, "loss": 1.0812, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.343372106552124, "rewards/margins": 0.19214865565299988, "rewards/rejected": -1.5355207920074463, "sft_loss": 1.3805100917816162, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 5.031018144550673, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.10228991508483887, "logits/rejected": 0.04363972693681717, "logps/chosen": -1.2929900884628296, "logps/rejected": -1.5646179914474487, "loss": 1.0514, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2929900884628296, "rewards/margins": 0.2716279923915863, "rewards/rejected": -1.5646179914474487, "sft_loss": 1.345216989517212, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 5.282991735343984, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.04971680790185928, "logits/rejected": 0.031308285892009735, "logps/chosen": -1.2953236103057861, "logps/rejected": -1.4893214702606201, "loss": 1.0824, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2953236103057861, "rewards/margins": 0.19399778544902802, "rewards/rejected": -1.4893214702606201, "sft_loss": 1.3761422634124756, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 5.142929280960286, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.18090695142745972, "logits/rejected": -0.04280409589409828, "logps/chosen": -1.2623097896575928, "logps/rejected": -1.640331506729126, "loss": 1.0059, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2623097896575928, "rewards/margins": 0.3780217170715332, "rewards/rejected": -1.640331506729126, "sft_loss": 1.3180025815963745, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 6.015219080547802, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.04094986245036125, "logits/rejected": 0.10819850116968155, "logps/chosen": -1.3020027875900269, "logps/rejected": -1.5312139987945557, "loss": 1.0636, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3020027875900269, "rewards/margins": 0.22921118140220642, "rewards/rejected": -1.5312139987945557, "sft_loss": 1.3365328311920166, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 6.2344472094409635, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.07465235143899918, "logits/rejected": 0.07446275651454926, "logps/chosen": -1.3151795864105225, "logps/rejected": -1.5252140760421753, "loss": 1.0723, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3151795864105225, "rewards/margins": 0.21003445982933044, "rewards/rejected": -1.5252140760421753, "sft_loss": 1.3637964725494385, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 9.360317087259874, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.03616784140467644, "logits/rejected": 0.03897800296545029, "logps/chosen": -1.3293553590774536, "logps/rejected": -1.5339654684066772, "loss": 1.0521, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3293553590774536, "rewards/margins": 0.20461025834083557, "rewards/rejected": -1.5339654684066772, "sft_loss": 1.312124252319336, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 8.443872063306976, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.1401108205318451, "logits/rejected": 0.015832537785172462, "logps/chosen": -1.2916826009750366, "logps/rejected": -1.6708862781524658, "loss": 1.0449, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2916826009750366, "rewards/margins": 0.3792034685611725, "rewards/rejected": -1.6708862781524658, "sft_loss": 1.3466923236846924, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 5.552897718500467, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.07638445496559143, "logits/rejected": 0.1197354644536972, "logps/chosen": -1.3811604976654053, "logps/rejected": -1.518565058708191, "loss": 1.102, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3811604976654053, "rewards/margins": 0.13740459084510803, "rewards/rejected": -1.518565058708191, "sft_loss": 1.3802618980407715, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 6.599402994951173, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.060695432126522064, "logits/rejected": 0.014427835121750832, "logps/chosen": -1.3435875177383423, "logps/rejected": -1.4622777700424194, "loss": 1.1121, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3435875177383423, "rewards/margins": 0.11869029700756073, "rewards/rejected": -1.4622777700424194, "sft_loss": 1.3856557607650757, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 6.3743422415046345, "learning_rate": 8.495110657042488e-07, "logits/chosen": 0.006036204285919666, "logits/rejected": 0.12875476479530334, "logps/chosen": -1.3412091732025146, "logps/rejected": -1.6945394277572632, "loss": 1.025, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3412091732025146, "rewards/margins": 0.3533302843570709, "rewards/rejected": -1.6945394277572632, "sft_loss": 1.362762212753296, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 8.105021438155644, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.05386769026517868, "logits/rejected": 0.04598368704319, "logps/chosen": -1.3784972429275513, "logps/rejected": -1.6016931533813477, "loss": 1.1197, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3784972429275513, "rewards/margins": 0.22319582104682922, "rewards/rejected": -1.6016931533813477, "sft_loss": 1.4001914262771606, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 10.38510756238131, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.20087404549121857, "logits/rejected": -0.05395135283470154, "logps/chosen": -1.402502417564392, "logps/rejected": -1.540789008140564, "loss": 1.1224, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.402502417564392, "rewards/margins": 0.13828660547733307, "rewards/rejected": -1.540789008140564, "sft_loss": 1.3710534572601318, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 9.39666350747347, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.07947036623954773, "logits/rejected": 0.03734710440039635, "logps/chosen": -1.3037769794464111, "logps/rejected": -1.5832937955856323, "loss": 1.0464, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3037769794464111, "rewards/margins": 0.2795167565345764, "rewards/rejected": -1.5832937955856323, "sft_loss": 1.3439761400222778, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 6.725630926196394, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.04184072092175484, "logits/rejected": 0.049746450036764145, "logps/chosen": -1.4166171550750732, "logps/rejected": -1.637763261795044, "loss": 1.0816, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4166171550750732, "rewards/margins": 0.22114595770835876, "rewards/rejected": -1.637763261795044, "sft_loss": 1.3935834169387817, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 6.224624107660037, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.07246112823486328, "logits/rejected": 0.014533983543515205, "logps/chosen": -1.2703287601470947, "logps/rejected": -1.5969499349594116, "loss": 1.021, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2703287601470947, "rewards/margins": 0.3266211450099945, "rewards/rejected": -1.5969499349594116, "sft_loss": 1.3040375709533691, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 8.211500198907471, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.14373692870140076, "logits/rejected": 0.017202334478497505, "logps/chosen": -1.3954145908355713, "logps/rejected": -1.6238315105438232, "loss": 1.0964, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3954145908355713, "rewards/margins": 0.22841675579547882, "rewards/rejected": -1.6238315105438232, "sft_loss": 1.4205118417739868, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 7.784393754102794, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.1777457594871521, "logits/rejected": -0.03787863254547119, "logps/chosen": -1.380475640296936, "logps/rejected": -1.5722134113311768, "loss": 1.0659, "rewards/accuracies": 0.53125, "rewards/chosen": -1.380475640296936, "rewards/margins": 0.1917375922203064, "rewards/rejected": -1.5722134113311768, "sft_loss": 1.3126857280731201, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 9.483278737279292, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.04560353606939316, "logits/rejected": 0.127095028758049, "logps/chosen": -1.3196014165878296, "logps/rejected": -1.7367639541625977, "loss": 1.0283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3196014165878296, "rewards/margins": 0.4171624183654785, "rewards/rejected": -1.7367639541625977, "sft_loss": 1.3578262329101562, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 6.936539864471406, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.10250772535800934, "logits/rejected": -0.022584009915590286, "logps/chosen": -1.2747838497161865, "logps/rejected": -1.587401032447815, "loss": 1.0299, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2747838497161865, "rewards/margins": 0.31261715292930603, "rewards/rejected": -1.587401032447815, "sft_loss": 1.2519252300262451, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 5.619838750688994, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.11007682234048843, "logits/rejected": 0.08966507017612457, "logps/chosen": -1.3074545860290527, "logps/rejected": -1.5856674909591675, "loss": 1.05, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3074545860290527, "rewards/margins": 0.2782130241394043, "rewards/rejected": -1.5856674909591675, "sft_loss": 1.386627197265625, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 5.740860945772739, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.042826805263757706, "logits/rejected": 0.01792163774371147, "logps/chosen": -1.3253368139266968, "logps/rejected": -1.5498539209365845, "loss": 1.0328, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3253368139266968, "rewards/margins": 0.2245168685913086, "rewards/rejected": -1.5498539209365845, "sft_loss": 1.3255261182785034, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 5.0922788595642, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.1649695336818695, "logits/rejected": 0.1055021733045578, "logps/chosen": -1.3374435901641846, "logps/rejected": -1.5897200107574463, "loss": 1.0576, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3374435901641846, "rewards/margins": 0.25227636098861694, "rewards/rejected": -1.5897200107574463, "sft_loss": 1.3770906925201416, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 3.926305418162864, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.08125348389148712, "logits/rejected": 0.07250069081783295, "logps/chosen": -1.3621914386749268, "logps/rejected": -1.594728708267212, "loss": 1.0676, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3621914386749268, "rewards/margins": 0.2325374186038971, "rewards/rejected": -1.594728708267212, "sft_loss": 1.352673888206482, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 6.780850327708814, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.17982670664787292, "logits/rejected": -0.05172845721244812, "logps/chosen": -1.3353796005249023, "logps/rejected": -1.570927381515503, "loss": 1.0505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3353796005249023, "rewards/margins": 0.23554787039756775, "rewards/rejected": -1.570927381515503, "sft_loss": 1.3366186618804932, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 4.888021467437801, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.10884324461221695, "logits/rejected": -0.02582775428891182, "logps/chosen": -1.3080412149429321, "logps/rejected": -1.5268137454986572, "loss": 1.076, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3080412149429321, "rewards/margins": 0.2187725007534027, "rewards/rejected": -1.5268137454986572, "sft_loss": 1.3675590753555298, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 5.630214545738198, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.12240447849035263, "logits/rejected": -0.039300721138715744, "logps/chosen": -1.2059236764907837, "logps/rejected": -1.545480728149414, "loss": 0.9855, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2059236764907837, "rewards/margins": 0.3395571708679199, "rewards/rejected": -1.545480728149414, "sft_loss": 1.258929967880249, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 7.493618244933485, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.2451147735118866, "logits/rejected": -0.020291466265916824, "logps/chosen": -1.375449299812317, "logps/rejected": -1.613504409790039, "loss": 1.0801, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.375449299812317, "rewards/margins": 0.23805518448352814, "rewards/rejected": -1.613504409790039, "sft_loss": 1.4104481935501099, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 7.361962075932273, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.1329573094844818, "logits/rejected": 0.01702733151614666, "logps/chosen": -1.2550948858261108, "logps/rejected": -1.5974655151367188, "loss": 0.987, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2550948858261108, "rewards/margins": 0.3423704504966736, "rewards/rejected": -1.5974655151367188, "sft_loss": 1.2982847690582275, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 8.331882985469626, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.17175039649009705, "logits/rejected": 0.02810470387339592, "logps/chosen": -1.3440721035003662, "logps/rejected": -1.7161515951156616, "loss": 1.0069, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3440721035003662, "rewards/margins": 0.3720795810222626, "rewards/rejected": -1.7161515951156616, "sft_loss": 1.3601570129394531, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 6.441709503588563, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.08347249776124954, "logits/rejected": 0.03723754733800888, "logps/chosen": -1.3489030599594116, "logps/rejected": -1.716583013534546, "loss": 1.0284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3489030599594116, "rewards/margins": 0.36768001317977905, "rewards/rejected": -1.716583013534546, "sft_loss": 1.363987684249878, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 4.881873706559672, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.18445467948913574, "logits/rejected": -0.005650246050208807, "logps/chosen": -1.3698749542236328, "logps/rejected": -1.8684381246566772, "loss": 1.0048, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3698749542236328, "rewards/margins": 0.498563289642334, "rewards/rejected": -1.8684381246566772, "sft_loss": 1.3930516242980957, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 6.394072611705559, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.08021242916584015, "logits/rejected": 0.1560056209564209, "logps/chosen": -1.4112555980682373, "logps/rejected": -1.7629226446151733, "loss": 1.0666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4112555980682373, "rewards/margins": 0.35166722536087036, "rewards/rejected": -1.7629226446151733, "sft_loss": 1.4502036571502686, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 5.268966441319738, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.09227783232927322, "logits/rejected": -0.02726762369275093, "logps/chosen": -1.2625749111175537, "logps/rejected": -1.6419893503189087, "loss": 0.98, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2625749111175537, "rewards/margins": 0.3794143795967102, "rewards/rejected": -1.6419893503189087, "sft_loss": 1.28301203250885, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 7.312813219261093, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.14062745869159698, "logits/rejected": -0.10793910175561905, "logps/chosen": -1.3452221155166626, "logps/rejected": -1.6348785161972046, "loss": 1.0561, "rewards/accuracies": 0.625, "rewards/chosen": -1.3452221155166626, "rewards/margins": 0.28965651988983154, "rewards/rejected": -1.6348785161972046, "sft_loss": 1.4023061990737915, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 6.6596167436143565, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.057622771710157394, "logits/rejected": 0.018117045983672142, "logps/chosen": -1.2645435333251953, "logps/rejected": -1.7956523895263672, "loss": 0.9724, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2645435333251953, "rewards/margins": 0.5311091542243958, "rewards/rejected": -1.7956523895263672, "sft_loss": 1.31093430519104, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 5.363156650986811, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.1434890478849411, "logits/rejected": 0.008483712561428547, "logps/chosen": -1.3194637298583984, "logps/rejected": -1.646680235862732, "loss": 1.034, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3194637298583984, "rewards/margins": 0.3272164762020111, "rewards/rejected": -1.646680235862732, "sft_loss": 1.3527896404266357, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 7.53618794053218, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.06295988708734512, "logits/rejected": 0.03186292201280594, "logps/chosen": -1.2528502941131592, "logps/rejected": -1.591352939605713, "loss": 1.0003, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2528502941131592, "rewards/margins": 0.3385026752948761, "rewards/rejected": -1.591352939605713, "sft_loss": 1.3003718852996826, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 6.419840363438428, "learning_rate": 8.170386052085389e-07, "logits/chosen": 0.008228405378758907, "logits/rejected": 0.13828298449516296, "logps/chosen": -1.3528460264205933, "logps/rejected": -1.6920034885406494, "loss": 1.0396, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3528460264205933, "rewards/margins": 0.3391575813293457, "rewards/rejected": -1.6920034885406494, "sft_loss": 1.3606065511703491, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 7.530584399794697, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.08242753148078918, "logits/rejected": 0.08598792552947998, "logps/chosen": -1.3864794969558716, "logps/rejected": -1.6212832927703857, "loss": 1.0795, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3864794969558716, "rewards/margins": 0.23480382561683655, "rewards/rejected": -1.6212832927703857, "sft_loss": 1.3649652004241943, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 7.789921643519763, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.1279408484697342, "logits/rejected": -0.02753205969929695, "logps/chosen": -1.3840112686157227, "logps/rejected": -1.6482646465301514, "loss": 1.0611, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3840112686157227, "rewards/margins": 0.26425355672836304, "rewards/rejected": -1.6482646465301514, "sft_loss": 1.3844091892242432, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 5.410895902437541, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.020565593615174294, "logits/rejected": 0.135699063539505, "logps/chosen": -1.2775145769119263, "logps/rejected": -1.7521684169769287, "loss": 0.9907, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2775145769119263, "rewards/margins": 0.47465381026268005, "rewards/rejected": -1.7521684169769287, "sft_loss": 1.3165042400360107, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 10.314479472868056, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.09743030369281769, "logits/rejected": 0.06601885706186295, "logps/chosen": -1.433811068534851, "logps/rejected": -1.760123610496521, "loss": 1.1366, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.433811068534851, "rewards/margins": 0.32631251215934753, "rewards/rejected": -1.760123610496521, "sft_loss": 1.414137601852417, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.24525417387485504, "eval_logits/rejected": 0.34158626198768616, "eval_logps/chosen": -1.3555330038070679, "eval_logps/rejected": -1.7039648294448853, "eval_loss": 1.049640417098999, "eval_rewards/accuracies": 0.5986647009849548, "eval_rewards/chosen": -1.3555330038070679, "eval_rewards/margins": 0.3484318256378174, "eval_rewards/rejected": -1.7039648294448853, "eval_runtime": 43.3282, "eval_samples_per_second": 31.042, "eval_sft_loss": 1.3798303604125977, "eval_steps_per_second": 7.778, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 7.411979260880644, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.051903557032346725, "logits/rejected": 0.03727956861257553, "logps/chosen": -1.3741095066070557, "logps/rejected": -1.613830804824829, "loss": 1.0907, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3741095066070557, "rewards/margins": 0.23972125351428986, "rewards/rejected": -1.613830804824829, "sft_loss": 1.402032494544983, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 8.070388340931352, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.002574050333350897, "logits/rejected": 0.11509355157613754, "logps/chosen": -1.2788218259811401, "logps/rejected": -1.5739259719848633, "loss": 1.037, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2788218259811401, "rewards/margins": 0.29510411620140076, "rewards/rejected": -1.5739259719848633, "sft_loss": 1.3477932214736938, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 6.782166764980172, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.03715590387582779, "logits/rejected": 0.12355498969554901, "logps/chosen": -1.2872711420059204, "logps/rejected": -1.597229242324829, "loss": 1.0119, "rewards/accuracies": 0.625, "rewards/chosen": -1.2872711420059204, "rewards/margins": 0.30995815992355347, "rewards/rejected": -1.597229242324829, "sft_loss": 1.2761762142181396, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 6.363280249836633, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.11921674013137817, "logits/rejected": 0.08086469024419785, "logps/chosen": -1.3570458889007568, "logps/rejected": -1.6779505014419556, "loss": 1.0451, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3570458889007568, "rewards/margins": 0.32090455293655396, "rewards/rejected": -1.6779505014419556, "sft_loss": 1.3510466814041138, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 5.666752186518074, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.019246716052293777, "logits/rejected": 0.12416522204875946, "logps/chosen": -1.3521109819412231, "logps/rejected": -1.7478277683258057, "loss": 1.0479, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3521109819412231, "rewards/margins": 0.39571696519851685, "rewards/rejected": -1.7478277683258057, "sft_loss": 1.402245283126831, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 6.035031708124639, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.08087825030088425, "logits/rejected": -0.054591964930295944, "logps/chosen": -1.294004201889038, "logps/rejected": -1.542240858078003, "loss": 1.0639, "rewards/accuracies": 0.59375, "rewards/chosen": -1.294004201889038, "rewards/margins": 0.24823662638664246, "rewards/rejected": -1.542240858078003, "sft_loss": 1.3424246311187744, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 10.063671915106058, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.11445492506027222, "logits/rejected": 0.06764774024486542, "logps/chosen": -1.3052616119384766, "logps/rejected": -1.6242589950561523, "loss": 1.05, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3052616119384766, "rewards/margins": 0.31899750232696533, "rewards/rejected": -1.6242589950561523, "sft_loss": 1.3409321308135986, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 30.912182108309786, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.16330935060977936, "logits/rejected": -0.08893314749002457, "logps/chosen": -1.3520915508270264, "logps/rejected": -1.588045597076416, "loss": 1.1011, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3520915508270264, "rewards/margins": 0.23595380783081055, "rewards/rejected": -1.588045597076416, "sft_loss": 1.399583101272583, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 8.051710900963842, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.028850510716438293, "logits/rejected": 0.020016059279441833, "logps/chosen": -1.3128397464752197, "logps/rejected": -1.6394424438476562, "loss": 1.0122, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3128397464752197, "rewards/margins": 0.3266026973724365, "rewards/rejected": -1.6394424438476562, "sft_loss": 1.2767457962036133, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 6.283046847276603, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.1665606051683426, "logits/rejected": -0.031640782952308655, "logps/chosen": -1.3250031471252441, "logps/rejected": -1.5790154933929443, "loss": 1.0946, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3250031471252441, "rewards/margins": 0.25401216745376587, "rewards/rejected": -1.5790154933929443, "sft_loss": 1.4127658605575562, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 7.269065772821926, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.11072566360235214, "logits/rejected": 0.017708975821733475, "logps/chosen": -1.2886245250701904, "logps/rejected": -1.5549981594085693, "loss": 1.0582, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2886245250701904, "rewards/margins": 0.26637381315231323, "rewards/rejected": -1.5549981594085693, "sft_loss": 1.3413861989974976, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 6.24349177180485, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.0787181407213211, "logits/rejected": -0.022912293672561646, "logps/chosen": -1.3490071296691895, "logps/rejected": -1.7265007495880127, "loss": 1.049, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3490071296691895, "rewards/margins": 0.37749359011650085, "rewards/rejected": -1.7265007495880127, "sft_loss": 1.4076695442199707, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 6.539411262922854, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.10219565778970718, "logits/rejected": 0.02910270355641842, "logps/chosen": -1.3191393613815308, "logps/rejected": -1.5854027271270752, "loss": 1.0652, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3191393613815308, "rewards/margins": 0.26626336574554443, "rewards/rejected": -1.5854027271270752, "sft_loss": 1.3123081922531128, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 6.305327686728363, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.13566602766513824, "logits/rejected": 0.016496330499649048, "logps/chosen": -1.2557474374771118, "logps/rejected": -1.6477458477020264, "loss": 0.9846, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2557474374771118, "rewards/margins": 0.3919984698295593, "rewards/rejected": -1.6477458477020264, "sft_loss": 1.2934027910232544, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 13.694157376075486, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.1433573067188263, "logits/rejected": -0.038366906344890594, "logps/chosen": -1.3339672088623047, "logps/rejected": -1.6908581256866455, "loss": 1.0569, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3339672088623047, "rewards/margins": 0.3568907380104065, "rewards/rejected": -1.6908581256866455, "sft_loss": 1.4096620082855225, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 5.398515345858442, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.19307354092597961, "logits/rejected": -0.03149569779634476, "logps/chosen": -1.2381755113601685, "logps/rejected": -1.6369307041168213, "loss": 0.9736, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2381755113601685, "rewards/margins": 0.3987550735473633, "rewards/rejected": -1.6369307041168213, "sft_loss": 1.2880828380584717, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 12.257195395456149, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.1564318835735321, "logits/rejected": -0.039251018315553665, "logps/chosen": -1.3086373805999756, "logps/rejected": -1.677406668663025, "loss": 1.0341, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3086373805999756, "rewards/margins": 0.3687690198421478, "rewards/rejected": -1.677406668663025, "sft_loss": 1.3267216682434082, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 6.909176677099874, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.22223138809204102, "logits/rejected": -0.2099127471446991, "logps/chosen": -1.2886847257614136, "logps/rejected": -1.5642931461334229, "loss": 1.0756, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2886847257614136, "rewards/margins": 0.2756083607673645, "rewards/rejected": -1.5642931461334229, "sft_loss": 1.3992526531219482, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 7.316699419141525, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.15762130916118622, "logits/rejected": -0.011286157183349133, "logps/chosen": -1.4465506076812744, "logps/rejected": -1.7105118036270142, "loss": 1.1048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4465506076812744, "rewards/margins": 0.26396113634109497, "rewards/rejected": -1.7105118036270142, "sft_loss": 1.474279761314392, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 7.1151059070629, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.04906468465924263, "logits/rejected": 0.007820269092917442, "logps/chosen": -1.450408935546875, "logps/rejected": -1.6943660974502563, "loss": 1.0934, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.450408935546875, "rewards/margins": 0.24395708739757538, "rewards/rejected": -1.6943660974502563, "sft_loss": 1.4425398111343384, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 7.058544700250131, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.027908477932214737, "logits/rejected": 0.09882084280252457, "logps/chosen": -1.3739533424377441, "logps/rejected": -1.7385390996932983, "loss": 1.0386, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3739533424377441, "rewards/margins": 0.3645857274532318, "rewards/rejected": -1.7385390996932983, "sft_loss": 1.357513189315796, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 7.232777343396231, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.15299741923809052, "logits/rejected": 0.01500866748392582, "logps/chosen": -1.3116304874420166, "logps/rejected": -1.8638451099395752, "loss": 1.0158, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3116304874420166, "rewards/margins": 0.552214503288269, "rewards/rejected": -1.8638451099395752, "sft_loss": 1.368384599685669, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 8.938851736706948, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.006882402114570141, "logits/rejected": 0.05765901878476143, "logps/chosen": -1.3367273807525635, "logps/rejected": -1.6397292613983154, "loss": 1.0679, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3367273807525635, "rewards/margins": 0.3030018210411072, "rewards/rejected": -1.6397292613983154, "sft_loss": 1.3929353952407837, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 6.702017921722495, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.12569338083267212, "logits/rejected": -0.0814434066414833, "logps/chosen": -1.3406602144241333, "logps/rejected": -1.5882494449615479, "loss": 1.0758, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3406602144241333, "rewards/margins": 0.24758926033973694, "rewards/rejected": -1.5882494449615479, "sft_loss": 1.417973518371582, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 4.990351697136921, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.08439499139785767, "logits/rejected": -0.01835174486041069, "logps/chosen": -1.2689683437347412, "logps/rejected": -1.693606972694397, "loss": 0.9787, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2689683437347412, "rewards/margins": 0.4246388375759125, "rewards/rejected": -1.693606972694397, "sft_loss": 1.3011971712112427, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 8.595033598654478, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.04252857714891434, "logits/rejected": 0.2256098985671997, "logps/chosen": -1.352736473083496, "logps/rejected": -1.7007734775543213, "loss": 1.0446, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.352736473083496, "rewards/margins": 0.34803682565689087, "rewards/rejected": -1.7007734775543213, "sft_loss": 1.407582402229309, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 5.344545744263389, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.030758550390601158, "logits/rejected": 0.15121528506278992, "logps/chosen": -1.3427588939666748, "logps/rejected": -1.7324472665786743, "loss": 1.037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3427588939666748, "rewards/margins": 0.3896884322166443, "rewards/rejected": -1.7324472665786743, "sft_loss": 1.370418906211853, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 9.515661126009993, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.0861203521490097, "logits/rejected": 0.0647224485874176, "logps/chosen": -1.3469445705413818, "logps/rejected": -1.6751673221588135, "loss": 1.0474, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3469445705413818, "rewards/margins": 0.32822278141975403, "rewards/rejected": -1.6751673221588135, "sft_loss": 1.3514187335968018, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 9.819615223092732, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.04387445002794266, "logits/rejected": 0.052936069667339325, "logps/chosen": -1.2406890392303467, "logps/rejected": -1.6722383499145508, "loss": 1.003, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2406890392303467, "rewards/margins": 0.4315493106842041, "rewards/rejected": -1.6722383499145508, "sft_loss": 1.3141372203826904, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 6.81998773618303, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.10533297061920166, "logits/rejected": -0.043056420981884, "logps/chosen": -1.3972469568252563, "logps/rejected": -1.7338556051254272, "loss": 1.0677, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3972469568252563, "rewards/margins": 0.3366088569164276, "rewards/rejected": -1.7338556051254272, "sft_loss": 1.4280717372894287, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 6.608290407818981, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.007891124114394188, "logits/rejected": 0.06537625938653946, "logps/chosen": -1.4082162380218506, "logps/rejected": -1.709684133529663, "loss": 1.083, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4082162380218506, "rewards/margins": 0.30146756768226624, "rewards/rejected": -1.709684133529663, "sft_loss": 1.4514955282211304, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 7.508397978132879, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.11488986015319824, "logits/rejected": 0.0039465115405619144, "logps/chosen": -1.3215010166168213, "logps/rejected": -1.7341926097869873, "loss": 1.0216, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3215010166168213, "rewards/margins": 0.4126916825771332, "rewards/rejected": -1.7341926097869873, "sft_loss": 1.3616695404052734, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 7.207242047125425, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.1129886656999588, "logits/rejected": 0.06885222345590591, "logps/chosen": -1.346785545349121, "logps/rejected": -1.8081716299057007, "loss": 1.0042, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.346785545349121, "rewards/margins": 0.4613862633705139, "rewards/rejected": -1.8081716299057007, "sft_loss": 1.3442741632461548, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 14.002661900284268, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.08279160410165787, "logits/rejected": 0.1021057590842247, "logps/chosen": -1.321763515472412, "logps/rejected": -1.7964341640472412, "loss": 1.0098, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.321763515472412, "rewards/margins": 0.47467073798179626, "rewards/rejected": -1.7964341640472412, "sft_loss": 1.3471057415008545, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 7.03794605195869, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.11434582620859146, "logits/rejected": 0.06666027009487152, "logps/chosen": -1.3573967218399048, "logps/rejected": -1.6420046091079712, "loss": 1.0553, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3573967218399048, "rewards/margins": 0.2846079468727112, "rewards/rejected": -1.6420046091079712, "sft_loss": 1.386859655380249, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 5.144226817708508, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.10920850187540054, "logits/rejected": -0.023228798061609268, "logps/chosen": -1.3085116147994995, "logps/rejected": -1.6747932434082031, "loss": 1.0385, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3085116147994995, "rewards/margins": 0.3662816882133484, "rewards/rejected": -1.6747932434082031, "sft_loss": 1.3602230548858643, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 8.1907000790852, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.01839374378323555, "logits/rejected": 0.13123062252998352, "logps/chosen": -1.340759515762329, "logps/rejected": -1.6185781955718994, "loss": 1.0455, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.340759515762329, "rewards/margins": 0.277818500995636, "rewards/rejected": -1.6185781955718994, "sft_loss": 1.3402440547943115, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 6.992143239090281, "learning_rate": 7.638933899585354e-07, "logits/chosen": 0.09558597952127457, "logits/rejected": 0.15679627656936646, "logps/chosen": -1.324134111404419, "logps/rejected": -1.6583322286605835, "loss": 1.0549, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.324134111404419, "rewards/margins": 0.3341982960700989, "rewards/rejected": -1.6583322286605835, "sft_loss": 1.4005115032196045, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 10.815572949506729, "learning_rate": 7.625692901064573e-07, "logits/chosen": 0.027740132063627243, "logits/rejected": 0.13389061391353607, "logps/chosen": -1.307284951210022, "logps/rejected": -1.735878348350525, "loss": 1.0106, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.307284951210022, "rewards/margins": 0.428593248128891, "rewards/rejected": -1.735878348350525, "sft_loss": 1.3475149869918823, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 6.395374339569294, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.07172278314828873, "logits/rejected": -0.0779951885342598, "logps/chosen": -1.342049241065979, "logps/rejected": -1.6955223083496094, "loss": 1.0557, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.342049241065979, "rewards/margins": 0.3534731864929199, "rewards/rejected": -1.6955223083496094, "sft_loss": 1.4067165851593018, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 4.956191208540565, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.12661337852478027, "logits/rejected": 0.07305797189474106, "logps/chosen": -1.253143548965454, "logps/rejected": -1.6444408893585205, "loss": 0.9738, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.253143548965454, "rewards/margins": 0.39129742980003357, "rewards/rejected": -1.6444408893585205, "sft_loss": 1.2474725246429443, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 7.880389938787795, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.16296562552452087, "logits/rejected": -0.0367860347032547, "logps/chosen": -1.2162091732025146, "logps/rejected": -1.6427171230316162, "loss": 0.9846, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2162091732025146, "rewards/margins": 0.42650800943374634, "rewards/rejected": -1.6427171230316162, "sft_loss": 1.2575920820236206, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 9.198438837473466, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.14288155734539032, "logits/rejected": -0.05155428498983383, "logps/chosen": -1.4426788091659546, "logps/rejected": -1.8797391653060913, "loss": 1.0364, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4426788091659546, "rewards/margins": 0.4370604455471039, "rewards/rejected": -1.8797391653060913, "sft_loss": 1.391655683517456, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 7.850915911352129, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.18914887309074402, "logits/rejected": -0.05741081386804581, "logps/chosen": -1.2726414203643799, "logps/rejected": -1.6055552959442139, "loss": 1.0441, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2726414203643799, "rewards/margins": 0.33291396498680115, "rewards/rejected": -1.6055552959442139, "sft_loss": 1.3326237201690674, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 7.153901114036553, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.17279231548309326, "logits/rejected": 0.04552667587995529, "logps/chosen": -1.2659995555877686, "logps/rejected": -1.68410325050354, "loss": 1.0001, "rewards/accuracies": 0.625, "rewards/chosen": -1.2659995555877686, "rewards/margins": 0.41810378432273865, "rewards/rejected": -1.68410325050354, "sft_loss": 1.323227047920227, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 10.417386352993438, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.1916198432445526, "logits/rejected": -0.04756034165620804, "logps/chosen": -1.3450605869293213, "logps/rejected": -1.8921089172363281, "loss": 1.0104, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3450605869293213, "rewards/margins": 0.5470482110977173, "rewards/rejected": -1.8921089172363281, "sft_loss": 1.349582552909851, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 6.542569036259526, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.16324032843112946, "logits/rejected": 0.029138848185539246, "logps/chosen": -1.2792186737060547, "logps/rejected": -1.7051481008529663, "loss": 1.0097, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2792186737060547, "rewards/margins": 0.42592939734458923, "rewards/rejected": -1.7051481008529663, "sft_loss": 1.2989497184753418, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 5.6759487246974185, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.13519462943077087, "logits/rejected": 0.031025957316160202, "logps/chosen": -1.2971508502960205, "logps/rejected": -1.6557174921035767, "loss": 1.0111, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2971508502960205, "rewards/margins": 0.3585665225982666, "rewards/rejected": -1.6557174921035767, "sft_loss": 1.3277194499969482, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 7.694915042722196, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.10741202533245087, "logits/rejected": -0.057648736983537674, "logps/chosen": -1.2721848487854004, "logps/rejected": -1.5831544399261475, "loss": 1.031, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2721848487854004, "rewards/margins": 0.3109695315361023, "rewards/rejected": -1.5831544399261475, "sft_loss": 1.3286257982254028, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 6.7435684660765185, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.02682996354997158, "logits/rejected": -0.0047081769444048405, "logps/chosen": -1.2474782466888428, "logps/rejected": -1.6963155269622803, "loss": 0.9555, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2474782466888428, "rewards/margins": 0.44883736968040466, "rewards/rejected": -1.6963155269622803, "sft_loss": 1.2800469398498535, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 8.174106215852202, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.11709091812372208, "logits/rejected": -0.0057418374344706535, "logps/chosen": -1.3883098363876343, "logps/rejected": -1.6163625717163086, "loss": 1.1113, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3883098363876343, "rewards/margins": 0.22805269062519073, "rewards/rejected": -1.6163625717163086, "sft_loss": 1.3883463144302368, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 7.346237599422651, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.06870261579751968, "logits/rejected": -0.08070772886276245, "logps/chosen": -1.3321738243103027, "logps/rejected": -1.5784136056900024, "loss": 1.0576, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3321738243103027, "rewards/margins": 0.24623961746692657, "rewards/rejected": -1.5784136056900024, "sft_loss": 1.3463549613952637, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 10.244188574833444, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.17542986571788788, "logits/rejected": -0.07479909807443619, "logps/chosen": -1.381115436553955, "logps/rejected": -1.6750051975250244, "loss": 1.076, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.381115436553955, "rewards/margins": 0.29388970136642456, "rewards/rejected": -1.6750051975250244, "sft_loss": 1.4426143169403076, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 8.758781027902057, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.2611498534679413, "logits/rejected": -0.08540613949298859, "logps/chosen": -1.3195956945419312, "logps/rejected": -1.7180702686309814, "loss": 1.0156, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3195956945419312, "rewards/margins": 0.39847445487976074, "rewards/rejected": -1.7180702686309814, "sft_loss": 1.3189489841461182, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 7.234792016644986, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.13181889057159424, "logits/rejected": -0.005660903174430132, "logps/chosen": -1.2871992588043213, "logps/rejected": -1.7353289127349854, "loss": 0.997, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2871992588043213, "rewards/margins": 0.448129802942276, "rewards/rejected": -1.7353289127349854, "sft_loss": 1.313063621520996, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 7.931257532736728, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.22026875615119934, "logits/rejected": -0.05739130824804306, "logps/chosen": -1.2202383279800415, "logps/rejected": -1.5603992938995361, "loss": 1.0085, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2202383279800415, "rewards/margins": 0.34016093611717224, "rewards/rejected": -1.5603992938995361, "sft_loss": 1.2787656784057617, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 6.290230551872891, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.1508043259382248, "logits/rejected": 0.047995325177907944, "logps/chosen": -1.366620659828186, "logps/rejected": -1.7611169815063477, "loss": 1.029, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.366620659828186, "rewards/margins": 0.39449644088745117, "rewards/rejected": -1.7611169815063477, "sft_loss": 1.3677552938461304, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 5.940764257248773, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.21726492047309875, "logits/rejected": -0.052529554814100266, "logps/chosen": -1.2305254936218262, "logps/rejected": -1.570896029472351, "loss": 1.0202, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2305254936218262, "rewards/margins": 0.3403705954551697, "rewards/rejected": -1.570896029472351, "sft_loss": 1.2762455940246582, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 5.630415390245314, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.17441818118095398, "logits/rejected": -0.08711175620555878, "logps/chosen": -1.306196928024292, "logps/rejected": -1.6996771097183228, "loss": 1.0499, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.306196928024292, "rewards/margins": 0.393480122089386, "rewards/rejected": -1.6996771097183228, "sft_loss": 1.4036033153533936, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 11.230895712558812, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.09944915771484375, "logits/rejected": 0.03277222439646721, "logps/chosen": -1.3344061374664307, "logps/rejected": -1.6166597604751587, "loss": 1.0808, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3344061374664307, "rewards/margins": 0.2822534739971161, "rewards/rejected": -1.6166597604751587, "sft_loss": 1.3840522766113281, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 7.693466549251898, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.14864280819892883, "logits/rejected": 0.0014330834383144975, "logps/chosen": -1.4466898441314697, "logps/rejected": -1.8557466268539429, "loss": 1.0487, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4466898441314697, "rewards/margins": 0.40905675292015076, "rewards/rejected": -1.8557466268539429, "sft_loss": 1.406456708908081, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 8.179927899991316, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.0914086252450943, "logits/rejected": 0.19686779379844666, "logps/chosen": -1.3624199628829956, "logps/rejected": -1.8249889612197876, "loss": 1.0177, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3624199628829956, "rewards/margins": 0.46256914734840393, "rewards/rejected": -1.8249889612197876, "sft_loss": 1.3630585670471191, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 8.392424794281267, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.06835584342479706, "logits/rejected": 0.12229911237955093, "logps/chosen": -1.379624605178833, "logps/rejected": -1.802242636680603, "loss": 1.0556, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.379624605178833, "rewards/margins": 0.42261797189712524, "rewards/rejected": -1.802242636680603, "sft_loss": 1.4104360342025757, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 7.796472695999387, "learning_rate": 7.286726973755554e-07, "logits/chosen": 0.007735180668532848, "logits/rejected": 0.037343163043260574, "logps/chosen": -1.3578131198883057, "logps/rejected": -1.7218258380889893, "loss": 1.0405, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3578131198883057, "rewards/margins": 0.3640126585960388, "rewards/rejected": -1.7218258380889893, "sft_loss": 1.3624298572540283, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 7.3567407076824205, "learning_rate": 7.272866090342493e-07, "logits/chosen": 0.05104445293545723, "logits/rejected": 0.1391843557357788, "logps/chosen": -1.3850175142288208, "logps/rejected": -1.7918422222137451, "loss": 1.0238, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3850175142288208, "rewards/margins": 0.40682488679885864, "rewards/rejected": -1.7918422222137451, "sft_loss": 1.3535354137420654, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 6.877849690366655, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.13437165319919586, "logits/rejected": -0.012639102526009083, "logps/chosen": -1.3359447717666626, "logps/rejected": -1.663922667503357, "loss": 1.0615, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3359447717666626, "rewards/margins": 0.32797807455062866, "rewards/rejected": -1.663922667503357, "sft_loss": 1.3866608142852783, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 5.778524261215048, "learning_rate": 7.245078304138335e-07, "logits/chosen": 0.009187871590256691, "logits/rejected": 0.07843149453401566, "logps/chosen": -1.3242933750152588, "logps/rejected": -1.7084728479385376, "loss": 1.0208, "rewards/accuracies": 0.625, "rewards/chosen": -1.3242933750152588, "rewards/margins": 0.38417941331863403, "rewards/rejected": -1.7084728479385376, "sft_loss": 1.3560478687286377, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 5.6294753375067295, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.17960448563098907, "logits/rejected": 0.014394590631127357, "logps/chosen": -1.339212417602539, "logps/rejected": -1.6445707082748413, "loss": 1.0601, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.339212417602539, "rewards/margins": 0.3053584098815918, "rewards/rejected": -1.6445707082748413, "sft_loss": 1.3502283096313477, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 6.783265605580876, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.08887577801942825, "logits/rejected": 0.08649233728647232, "logps/chosen": -1.3186254501342773, "logps/rejected": -1.6747783422470093, "loss": 1.0501, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3186254501342773, "rewards/margins": 0.3561529517173767, "rewards/rejected": -1.6747783422470093, "sft_loss": 1.3644529581069946, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 6.405760070279392, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.08319975435733795, "logits/rejected": 0.0747746080160141, "logps/chosen": -1.3581855297088623, "logps/rejected": -1.646794080734253, "loss": 1.06, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3581855297088623, "rewards/margins": 0.2886084318161011, "rewards/rejected": -1.646794080734253, "sft_loss": 1.3697419166564941, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 7.571472104965407, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.04250494763255119, "logits/rejected": 0.10361369699239731, "logps/chosen": -1.2371678352355957, "logps/rejected": -1.7140600681304932, "loss": 1.0039, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2371678352355957, "rewards/margins": 0.4768921732902527, "rewards/rejected": -1.7140600681304932, "sft_loss": 1.3077231645584106, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 10.78619174464002, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.07091349363327026, "logits/rejected": 0.06878503412008286, "logps/chosen": -1.305056095123291, "logps/rejected": -1.8479982614517212, "loss": 0.9951, "rewards/accuracies": 0.65625, "rewards/chosen": -1.305056095123291, "rewards/margins": 0.5429421663284302, "rewards/rejected": -1.8479982614517212, "sft_loss": 1.3788334131240845, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 5.651471963037841, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.05510222911834717, "logits/rejected": 0.0910344272851944, "logps/chosen": -1.3466378450393677, "logps/rejected": -1.7891864776611328, "loss": 1.0345, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3466378450393677, "rewards/margins": 0.4425484538078308, "rewards/rejected": -1.7891864776611328, "sft_loss": 1.4098457098007202, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 7.355148492814228, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.018819155171513557, "logits/rejected": 0.11880254745483398, "logps/chosen": -1.3327100276947021, "logps/rejected": -1.7054532766342163, "loss": 1.0319, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3327100276947021, "rewards/margins": 0.3727432191371918, "rewards/rejected": -1.7054532766342163, "sft_loss": 1.4049968719482422, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 10.943024429606822, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.1734219342470169, "logits/rejected": 0.013913175091147423, "logps/chosen": -1.247011423110962, "logps/rejected": -1.558885931968689, "loss": 1.0241, "rewards/accuracies": 0.59375, "rewards/chosen": -1.247011423110962, "rewards/margins": 0.3118746280670166, "rewards/rejected": -1.558885931968689, "sft_loss": 1.2846260070800781, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 11.851152770608321, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.13905705511569977, "logits/rejected": 0.09097929298877716, "logps/chosen": -1.3812119960784912, "logps/rejected": -1.8251721858978271, "loss": 1.0541, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3812119960784912, "rewards/margins": 0.4439600110054016, "rewards/rejected": -1.8251721858978271, "sft_loss": 1.4391671419143677, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 5.63380268619042, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.034522850066423416, "logits/rejected": 0.0760420560836792, "logps/chosen": -1.3381052017211914, "logps/rejected": -1.6845115423202515, "loss": 1.0354, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3381052017211914, "rewards/margins": 0.34640640020370483, "rewards/rejected": -1.6845115423202515, "sft_loss": 1.3170645236968994, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 5.798808205023328, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.017745880410075188, "logits/rejected": 0.04981974512338638, "logps/chosen": -1.2860960960388184, "logps/rejected": -1.7253410816192627, "loss": 0.9965, "rewards/accuracies": 0.625, "rewards/chosen": -1.2860960960388184, "rewards/margins": 0.43924492597579956, "rewards/rejected": -1.7253410816192627, "sft_loss": 1.348103642463684, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 6.97187103687865, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.24344687163829803, "logits/rejected": -0.10979632288217545, "logps/chosen": -1.232178807258606, "logps/rejected": -1.6103588342666626, "loss": 0.9885, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.232178807258606, "rewards/margins": 0.37818005681037903, "rewards/rejected": -1.6103588342666626, "sft_loss": 1.2975971698760986, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 7.840210741237986, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.07378415018320084, "logits/rejected": 0.05714235454797745, "logps/chosen": -1.2715994119644165, "logps/rejected": -1.623050332069397, "loss": 1.0133, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2715994119644165, "rewards/margins": 0.3514510989189148, "rewards/rejected": -1.623050332069397, "sft_loss": 1.3193587064743042, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.2707947790622711, "eval_logits/rejected": 0.3701152205467224, "eval_logps/chosen": -1.3638824224472046, "eval_logps/rejected": -1.7321114540100098, "eval_loss": 1.046066164970398, "eval_rewards/accuracies": 0.6053412556648254, "eval_rewards/chosen": -1.3638824224472046, "eval_rewards/margins": 0.36822912096977234, "eval_rewards/rejected": -1.7321114540100098, "eval_runtime": 43.3451, "eval_samples_per_second": 31.03, "eval_sft_loss": 1.3863850831985474, "eval_steps_per_second": 7.775, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 6.179847425136886, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.056697629392147064, "logits/rejected": 0.10876087844371796, "logps/chosen": -1.3541631698608398, "logps/rejected": -1.806817650794983, "loss": 1.0321, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3541631698608398, "rewards/margins": 0.4526546001434326, "rewards/rejected": -1.806817650794983, "sft_loss": 1.3751753568649292, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 6.282016623035913, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.18400783836841583, "logits/rejected": 0.03770359605550766, "logps/chosen": -1.3120901584625244, "logps/rejected": -1.5234274864196777, "loss": 1.0655, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3120901584625244, "rewards/margins": 0.2113373726606369, "rewards/rejected": -1.5234274864196777, "sft_loss": 1.3530489206314087, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 6.879817145949223, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.04526624456048012, "logits/rejected": 0.03693125396966934, "logps/chosen": -1.24125337600708, "logps/rejected": -1.7764074802398682, "loss": 0.9613, "rewards/accuracies": 0.65625, "rewards/chosen": -1.24125337600708, "rewards/margins": 0.5351541042327881, "rewards/rejected": -1.7764074802398682, "sft_loss": 1.2563873529434204, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 5.706686252449379, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.17454300820827484, "logits/rejected": -0.03778272494673729, "logps/chosen": -1.3651527166366577, "logps/rejected": -1.8023183345794678, "loss": 1.0069, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3651527166366577, "rewards/margins": 0.43716558814048767, "rewards/rejected": -1.8023183345794678, "sft_loss": 1.4338901042938232, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 8.818330042647473, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.03695257008075714, "logits/rejected": 0.04921901971101761, "logps/chosen": -1.2691466808319092, "logps/rejected": -1.5711511373519897, "loss": 1.0228, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2691466808319092, "rewards/margins": 0.30200451612472534, "rewards/rejected": -1.5711511373519897, "sft_loss": 1.3589909076690674, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 8.297898842674472, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.029715800657868385, "logits/rejected": 0.031632810831069946, "logps/chosen": -1.4271819591522217, "logps/rejected": -1.768367052078247, "loss": 1.0625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4271819591522217, "rewards/margins": 0.3411853313446045, "rewards/rejected": -1.768367052078247, "sft_loss": 1.4373775720596313, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 5.81668195731196, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.02307548001408577, "logits/rejected": 0.10794766992330551, "logps/chosen": -1.303928256034851, "logps/rejected": -1.580761194229126, "loss": 1.0365, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.303928256034851, "rewards/margins": 0.2768331468105316, "rewards/rejected": -1.580761194229126, "sft_loss": 1.3223628997802734, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 7.254516734316676, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.15354299545288086, "logits/rejected": -0.027570974081754684, "logps/chosen": -1.2981069087982178, "logps/rejected": -1.7389848232269287, "loss": 0.9941, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2981069087982178, "rewards/margins": 0.44087809324264526, "rewards/rejected": -1.7389848232269287, "sft_loss": 1.2723134756088257, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 6.238727861139878, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.14542250335216522, "logits/rejected": 0.005346921272575855, "logps/chosen": -1.3731145858764648, "logps/rejected": -1.6807750463485718, "loss": 1.062, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3731145858764648, "rewards/margins": 0.3076605498790741, "rewards/rejected": -1.6807750463485718, "sft_loss": 1.3621718883514404, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 7.541602340260906, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.11518025398254395, "logits/rejected": 0.05123286694288254, "logps/chosen": -1.3132606744766235, "logps/rejected": -1.8348419666290283, "loss": 0.9956, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3132606744766235, "rewards/margins": 0.5215811133384705, "rewards/rejected": -1.8348419666290283, "sft_loss": 1.3019697666168213, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 4.756591565678028, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.13995513319969177, "logits/rejected": 0.10645530372858047, "logps/chosen": -1.382127046585083, "logps/rejected": -1.7566617727279663, "loss": 1.0435, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.382127046585083, "rewards/margins": 0.37453463673591614, "rewards/rejected": -1.7566617727279663, "sft_loss": 1.4149354696273804, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 11.56479469442216, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.10803036391735077, "logits/rejected": 0.005168232135474682, "logps/chosen": -1.3827905654907227, "logps/rejected": -1.649247169494629, "loss": 1.0953, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3827905654907227, "rewards/margins": 0.26645663380622864, "rewards/rejected": -1.649247169494629, "sft_loss": 1.3851019144058228, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 7.275890781310672, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.07685400545597076, "logits/rejected": 0.005793456919491291, "logps/chosen": -1.3236695528030396, "logps/rejected": -1.7799656391143799, "loss": 1.018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3236695528030396, "rewards/margins": 0.45629605650901794, "rewards/rejected": -1.7799656391143799, "sft_loss": 1.3315627574920654, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 7.208833715018191, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.23564176261425018, "logits/rejected": -0.09703844785690308, "logps/chosen": -1.2878791093826294, "logps/rejected": -1.5357494354248047, "loss": 1.0621, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2878791093826294, "rewards/margins": 0.2478702962398529, "rewards/rejected": -1.5357494354248047, "sft_loss": 1.35406494140625, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 6.192328808121661, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.12468357384204865, "logits/rejected": -0.04723244532942772, "logps/chosen": -1.3072025775909424, "logps/rejected": -1.6626510620117188, "loss": 1.0168, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3072025775909424, "rewards/margins": 0.35544854402542114, "rewards/rejected": -1.6626510620117188, "sft_loss": 1.3535258769989014, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 5.514122164787027, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.25773757696151733, "logits/rejected": -0.0840383991599083, "logps/chosen": -1.3327374458312988, "logps/rejected": -1.7903152704238892, "loss": 1.0202, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3327374458312988, "rewards/margins": 0.4575781226158142, "rewards/rejected": -1.7903152704238892, "sft_loss": 1.4295130968093872, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 6.430291188228354, "learning_rate": 6.818417974097246e-07, "logits/chosen": -0.07060518860816956, "logits/rejected": 0.11927783489227295, "logps/chosen": -1.3509200811386108, "logps/rejected": -1.8332151174545288, "loss": 1.0684, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3509200811386108, "rewards/margins": 0.48229488730430603, "rewards/rejected": -1.8332151174545288, "sft_loss": 1.4322224855422974, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 9.207303108778117, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.19833900034427643, "logits/rejected": -0.13946710526943207, "logps/chosen": -1.3640668392181396, "logps/rejected": -1.7328455448150635, "loss": 1.0381, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3640668392181396, "rewards/margins": 0.368778795003891, "rewards/rejected": -1.7328455448150635, "sft_loss": 1.4077961444854736, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 6.748889211680902, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.08480075001716614, "logits/rejected": -0.07643101364374161, "logps/chosen": -1.3230348825454712, "logps/rejected": -1.7031409740447998, "loss": 1.0347, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3230348825454712, "rewards/margins": 0.3801063001155853, "rewards/rejected": -1.7031409740447998, "sft_loss": 1.3512378931045532, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 7.100440960952916, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.05479288101196289, "logits/rejected": 0.11367790400981903, "logps/chosen": -1.273905634880066, "logps/rejected": -1.6257766485214233, "loss": 1.0125, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.273905634880066, "rewards/margins": 0.3518711030483246, "rewards/rejected": -1.6257766485214233, "sft_loss": 1.3452768325805664, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 6.950889135224402, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.0914456844329834, "logits/rejected": 0.08299463987350464, "logps/chosen": -1.2730414867401123, "logps/rejected": -1.665330171585083, "loss": 1.0166, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2730414867401123, "rewards/margins": 0.3922889232635498, "rewards/rejected": -1.665330171585083, "sft_loss": 1.2936381101608276, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 9.560143778048294, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.15366467833518982, "logits/rejected": -0.012053056620061398, "logps/chosen": -1.2696418762207031, "logps/rejected": -1.6315727233886719, "loss": 1.0137, "rewards/accuracies": 0.625, "rewards/chosen": -1.2696418762207031, "rewards/margins": 0.36193081736564636, "rewards/rejected": -1.6315727233886719, "sft_loss": 1.322128176689148, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 7.348808577763548, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.07675446569919586, "logits/rejected": 0.04603142291307449, "logps/chosen": -1.337294578552246, "logps/rejected": -1.6499254703521729, "loss": 1.0765, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.337294578552246, "rewards/margins": 0.312630832195282, "rewards/rejected": -1.6499254703521729, "sft_loss": 1.409766435623169, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 6.2283781946087915, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.19296064972877502, "logits/rejected": -0.1111554503440857, "logps/chosen": -1.3319435119628906, "logps/rejected": -1.645559310913086, "loss": 1.0162, "rewards/accuracies": 0.625, "rewards/chosen": -1.3319435119628906, "rewards/margins": 0.3136158883571625, "rewards/rejected": -1.645559310913086, "sft_loss": 1.3112457990646362, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 7.404843081341424, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.1959419697523117, "logits/rejected": -0.1169554591178894, "logps/chosen": -1.3728597164154053, "logps/rejected": -1.762760877609253, "loss": 1.0579, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3728597164154053, "rewards/margins": 0.38990116119384766, "rewards/rejected": -1.762760877609253, "sft_loss": 1.4591501951217651, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 8.809403280161979, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.2071094512939453, "logits/rejected": -0.15805336833000183, "logps/chosen": -1.3241145610809326, "logps/rejected": -1.590745210647583, "loss": 1.0759, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3241145610809326, "rewards/margins": 0.2666308283805847, "rewards/rejected": -1.590745210647583, "sft_loss": 1.3455761671066284, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 4.819707847104657, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.2637235224246979, "logits/rejected": -0.1225053071975708, "logps/chosen": -1.3090099096298218, "logps/rejected": -1.7086610794067383, "loss": 1.0081, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3090099096298218, "rewards/margins": 0.39965111017227173, "rewards/rejected": -1.7086610794067383, "sft_loss": 1.3366161584854126, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 6.503670880002155, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.1527249813079834, "logits/rejected": 0.020453324541449547, "logps/chosen": -1.418816089630127, "logps/rejected": -1.6873157024383545, "loss": 1.0984, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.418816089630127, "rewards/margins": 0.2684997320175171, "rewards/rejected": -1.6873157024383545, "sft_loss": 1.475125789642334, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 5.701889647109998, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.1425601989030838, "logits/rejected": 0.027763869613409042, "logps/chosen": -1.3176789283752441, "logps/rejected": -1.7970898151397705, "loss": 1.0188, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3176789283752441, "rewards/margins": 0.4794110357761383, "rewards/rejected": -1.7970898151397705, "sft_loss": 1.373015284538269, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 7.455068759266927, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.1664392203092575, "logits/rejected": 0.0011624842882156372, "logps/chosen": -1.3408262729644775, "logps/rejected": -1.6157557964324951, "loss": 1.0608, "rewards/accuracies": 0.625, "rewards/chosen": -1.3408262729644775, "rewards/margins": 0.274929404258728, "rewards/rejected": -1.6157557964324951, "sft_loss": 1.3625861406326294, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 11.204779201931467, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.12884119153022766, "logits/rejected": -0.03482980281114578, "logps/chosen": -1.2732433080673218, "logps/rejected": -1.7895492315292358, "loss": 1.0184, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2732433080673218, "rewards/margins": 0.5163058638572693, "rewards/rejected": -1.7895492315292358, "sft_loss": 1.3326706886291504, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 10.327413438865861, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.12610045075416565, "logits/rejected": -0.00018233135051559657, "logps/chosen": -1.331601858139038, "logps/rejected": -1.7389627695083618, "loss": 1.0157, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.331601858139038, "rewards/margins": 0.4073609709739685, "rewards/rejected": -1.7389627695083618, "sft_loss": 1.3417237997055054, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 12.337276545842748, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.13059648871421814, "logits/rejected": -0.0020403326489031315, "logps/chosen": -1.3097715377807617, "logps/rejected": -1.7724525928497314, "loss": 1.019, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3097715377807617, "rewards/margins": 0.4626809060573578, "rewards/rejected": -1.7724525928497314, "sft_loss": 1.346893072128296, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 6.504970472580104, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.03162531182169914, "logits/rejected": 0.038547057658433914, "logps/chosen": -1.3186380863189697, "logps/rejected": -1.7203162908554077, "loss": 1.0231, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3186380863189697, "rewards/margins": 0.4016784727573395, "rewards/rejected": -1.7203162908554077, "sft_loss": 1.3438231945037842, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 6.770026584337929, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.20331497490406036, "logits/rejected": -0.05360947176814079, "logps/chosen": -1.2827414274215698, "logps/rejected": -1.6763532161712646, "loss": 1.0076, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2827414274215698, "rewards/margins": 0.393611878156662, "rewards/rejected": -1.6763532161712646, "sft_loss": 1.3209357261657715, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 6.561231636640119, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.13221685588359833, "logits/rejected": -0.05130600929260254, "logps/chosen": -1.366477131843567, "logps/rejected": -1.6261088848114014, "loss": 1.0784, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.366477131843567, "rewards/margins": 0.2596319019794464, "rewards/rejected": -1.6261088848114014, "sft_loss": 1.4306046962738037, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 8.893681756902707, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.035487428307533264, "logits/rejected": 0.130559042096138, "logps/chosen": -1.2470409870147705, "logps/rejected": -1.6920645236968994, "loss": 0.9589, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2470409870147705, "rewards/margins": 0.4450235962867737, "rewards/rejected": -1.6920645236968994, "sft_loss": 1.2662039995193481, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 8.02611117109066, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.15100671350955963, "logits/rejected": -0.009929725900292397, "logps/chosen": -1.280668020248413, "logps/rejected": -1.6584885120391846, "loss": 1.0392, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.280668020248413, "rewards/margins": 0.37782055139541626, "rewards/rejected": -1.6584885120391846, "sft_loss": 1.3776158094406128, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 6.564305665115609, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.08805367350578308, "logits/rejected": -0.04162890464067459, "logps/chosen": -1.300323486328125, "logps/rejected": -1.6258924007415771, "loss": 1.0244, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.300323486328125, "rewards/margins": 0.325569212436676, "rewards/rejected": -1.6258924007415771, "sft_loss": 1.3119922876358032, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 8.780559757186948, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.02691541239619255, "logits/rejected": 0.10557906329631805, "logps/chosen": -1.322824239730835, "logps/rejected": -1.562229871749878, "loss": 1.062, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.322824239730835, "rewards/margins": 0.2394055873155594, "rewards/rejected": -1.562229871749878, "sft_loss": 1.3450852632522583, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 9.368043742615306, "learning_rate": 6.465482192203129e-07, "logits/chosen": -0.050204623490571976, "logits/rejected": -0.002213549567386508, "logps/chosen": -1.3146531581878662, "logps/rejected": -1.6026958227157593, "loss": 1.0455, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3146531581878662, "rewards/margins": 0.28804266452789307, "rewards/rejected": -1.6026958227157593, "sft_loss": 1.3670458793640137, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 10.781070285392934, "learning_rate": 6.45058504694559e-07, "logits/chosen": 0.007836557924747467, "logits/rejected": 0.09162791818380356, "logps/chosen": -1.3540400266647339, "logps/rejected": -1.6941732168197632, "loss": 1.084, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3540400266647339, "rewards/margins": 0.3401332199573517, "rewards/rejected": -1.6941732168197632, "sft_loss": 1.3959096670150757, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 18.03243421842275, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.14814485609531403, "logits/rejected": -0.009653128683567047, "logps/chosen": -1.2639660835266113, "logps/rejected": -1.7054649591445923, "loss": 1.0177, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2639660835266113, "rewards/margins": 0.4414988160133362, "rewards/rejected": -1.7054649591445923, "sft_loss": 1.331084132194519, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 8.824553369123494, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.13758358359336853, "logits/rejected": -0.13457614183425903, "logps/chosen": -1.3871793746948242, "logps/rejected": -1.588173270225525, "loss": 1.1179, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3871793746948242, "rewards/margins": 0.20099392533302307, "rewards/rejected": -1.588173270225525, "sft_loss": 1.4249435663223267, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 9.777034551382114, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.0834670439362526, "logits/rejected": 0.0681837722659111, "logps/chosen": -1.333077073097229, "logps/rejected": -1.6984277963638306, "loss": 1.027, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.333077073097229, "rewards/margins": 0.3653508424758911, "rewards/rejected": -1.6984277963638306, "sft_loss": 1.3320882320404053, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 9.18561160006191, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.24002377688884735, "logits/rejected": -0.04056672379374504, "logps/chosen": -1.3651529550552368, "logps/rejected": -1.670501708984375, "loss": 1.0717, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3651529550552368, "rewards/margins": 0.30534881353378296, "rewards/rejected": -1.670501708984375, "sft_loss": 1.3859248161315918, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 18.481416157033973, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.1223798543214798, "logits/rejected": -0.030973097309470177, "logps/chosen": -1.3061634302139282, "logps/rejected": -1.631188988685608, "loss": 1.038, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3061634302139282, "rewards/margins": 0.325025349855423, "rewards/rejected": -1.631188988685608, "sft_loss": 1.3667783737182617, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 5.841523887577847, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.08212809264659882, "logits/rejected": -0.0031019255984574556, "logps/chosen": -1.3627724647521973, "logps/rejected": -1.6993802785873413, "loss": 1.0494, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3627724647521973, "rewards/margins": 0.3366078734397888, "rewards/rejected": -1.6993802785873413, "sft_loss": 1.36763596534729, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 9.005289663929863, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.12843796610832214, "logits/rejected": -0.047466933727264404, "logps/chosen": -1.3065561056137085, "logps/rejected": -1.6848160028457642, "loss": 1.0111, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3065561056137085, "rewards/margins": 0.37825995683670044, "rewards/rejected": -1.6848160028457642, "sft_loss": 1.3081876039505005, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 5.470887175371584, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.18649035692214966, "logits/rejected": -0.03194127231836319, "logps/chosen": -1.3147566318511963, "logps/rejected": -1.8917913436889648, "loss": 0.9953, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3147566318511963, "rewards/margins": 0.5770348310470581, "rewards/rejected": -1.8917913436889648, "sft_loss": 1.352008581161499, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 9.071697896178694, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.10353100299835205, "logits/rejected": 0.05252040550112724, "logps/chosen": -1.2148979902267456, "logps/rejected": -1.5812180042266846, "loss": 0.9736, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2148979902267456, "rewards/margins": 0.3663199543952942, "rewards/rejected": -1.5812180042266846, "sft_loss": 1.262518286705017, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 12.04479228633977, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.03726924955844879, "logits/rejected": 0.014096438884735107, "logps/chosen": -1.2449657917022705, "logps/rejected": -1.7064529657363892, "loss": 0.9874, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2449657917022705, "rewards/margins": 0.4614872336387634, "rewards/rejected": -1.7064529657363892, "sft_loss": 1.2565490007400513, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 7.666946249067871, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.18409328162670135, "logits/rejected": -0.044957343488931656, "logps/chosen": -1.3527686595916748, "logps/rejected": -1.743014931678772, "loss": 1.0477, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3527686595916748, "rewards/margins": 0.39024627208709717, "rewards/rejected": -1.743014931678772, "sft_loss": 1.468995213508606, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 10.556427345967442, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.06500004231929779, "logits/rejected": 0.026881689205765724, "logps/chosen": -1.4370088577270508, "logps/rejected": -1.703037977218628, "loss": 1.091, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4370088577270508, "rewards/margins": 0.2660290002822876, "rewards/rejected": -1.703037977218628, "sft_loss": 1.4438323974609375, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 12.558739727033354, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.13178256154060364, "logits/rejected": 0.031609587371349335, "logps/chosen": -1.3780395984649658, "logps/rejected": -1.6814756393432617, "loss": 1.0411, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3780395984649658, "rewards/margins": 0.3034361004829407, "rewards/rejected": -1.6814756393432617, "sft_loss": 1.2921960353851318, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 10.25144106346559, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.17347553372383118, "logits/rejected": -0.004042397253215313, "logps/chosen": -1.3607873916625977, "logps/rejected": -1.6869118213653564, "loss": 1.0642, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3607873916625977, "rewards/margins": 0.3261243999004364, "rewards/rejected": -1.6869118213653564, "sft_loss": 1.4142537117004395, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 8.068433911299971, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.1763458549976349, "logits/rejected": 0.03705815225839615, "logps/chosen": -1.3179851770401, "logps/rejected": -1.624053716659546, "loss": 1.0269, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3179851770401, "rewards/margins": 0.3060687780380249, "rewards/rejected": -1.624053716659546, "sft_loss": 1.3464586734771729, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 21.85675698669309, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.30096036195755005, "logits/rejected": -0.06160721182823181, "logps/chosen": -1.345220685005188, "logps/rejected": -1.830958604812622, "loss": 1.0149, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.345220685005188, "rewards/margins": 0.48573771119117737, "rewards/rejected": -1.830958604812622, "sft_loss": 1.3653134107589722, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 12.176114443779284, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.0862717255949974, "logits/rejected": -0.054527319967746735, "logps/chosen": -1.33372962474823, "logps/rejected": -1.7393748760223389, "loss": 1.0235, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.33372962474823, "rewards/margins": 0.4056454300880432, "rewards/rejected": -1.7393748760223389, "sft_loss": 1.3457729816436768, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 9.070582268020733, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.16579201817512512, "logits/rejected": 0.02601494826376438, "logps/chosen": -1.3615131378173828, "logps/rejected": -1.7076470851898193, "loss": 1.0667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3615131378173828, "rewards/margins": 0.34613385796546936, "rewards/rejected": -1.7076470851898193, "sft_loss": 1.4235488176345825, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 5.839713039735426, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.1995636522769928, "logits/rejected": 0.01864362321794033, "logps/chosen": -1.218213677406311, "logps/rejected": -1.7912061214447021, "loss": 0.918, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.218213677406311, "rewards/margins": 0.5729925036430359, "rewards/rejected": -1.7912061214447021, "sft_loss": 1.2121398448944092, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 9.179052304791725, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.09858128428459167, "logits/rejected": -0.066671222448349, "logps/chosen": -1.3320286273956299, "logps/rejected": -1.705482840538025, "loss": 1.0185, "rewards/accuracies": 0.625, "rewards/chosen": -1.3320286273956299, "rewards/margins": 0.37345418334007263, "rewards/rejected": -1.705482840538025, "sft_loss": 1.3238738775253296, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 5.483634729918384, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.2557533383369446, "logits/rejected": -0.15631580352783203, "logps/chosen": -1.2451080083847046, "logps/rejected": -1.5999696254730225, "loss": 1.0356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2451080083847046, "rewards/margins": 0.35486167669296265, "rewards/rejected": -1.5999696254730225, "sft_loss": 1.3500633239746094, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 6.63302785728638, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.18139012157917023, "logits/rejected": -0.0644666850566864, "logps/chosen": -1.298688530921936, "logps/rejected": -1.5520808696746826, "loss": 1.0355, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.298688530921936, "rewards/margins": 0.2533922791481018, "rewards/rejected": -1.5520808696746826, "sft_loss": 1.297526478767395, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 7.301750134692686, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.1751355677843094, "logits/rejected": -0.06084384769201279, "logps/chosen": -1.2951709032058716, "logps/rejected": -1.6943944692611694, "loss": 1.004, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2951709032058716, "rewards/margins": 0.399223655462265, "rewards/rejected": -1.6943944692611694, "sft_loss": 1.303188443183899, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 7.705834050978145, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.21830907464027405, "logits/rejected": -0.04081263020634651, "logps/chosen": -1.3108503818511963, "logps/rejected": -1.7535419464111328, "loss": 1.024, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3108503818511963, "rewards/margins": 0.44269147515296936, "rewards/rejected": -1.7535419464111328, "sft_loss": 1.3522942066192627, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 7.187849820236266, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.19904953241348267, "logits/rejected": -0.0006094604614190757, "logps/chosen": -1.2227904796600342, "logps/rejected": -1.7524545192718506, "loss": 0.9557, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2227904796600342, "rewards/margins": 0.5296639204025269, "rewards/rejected": -1.7524545192718506, "sft_loss": 1.3017102479934692, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 10.480615331176718, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.1879226267337799, "logits/rejected": -0.06761939823627472, "logps/chosen": -1.293309211730957, "logps/rejected": -1.8878666162490845, "loss": 0.984, "rewards/accuracies": 0.625, "rewards/chosen": -1.293309211730957, "rewards/margins": 0.5945574045181274, "rewards/rejected": -1.8878666162490845, "sft_loss": 1.3281826972961426, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 6.39305835700618, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.17131337523460388, "logits/rejected": -0.03572530299425125, "logps/chosen": -1.2976057529449463, "logps/rejected": -1.6221458911895752, "loss": 1.0387, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2976057529449463, "rewards/margins": 0.32454022765159607, "rewards/rejected": -1.6221458911895752, "sft_loss": 1.3275409936904907, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 6.729328891716704, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.15203949809074402, "logits/rejected": -0.045757196843624115, "logps/chosen": -1.3872594833374023, "logps/rejected": -1.879111886024475, "loss": 1.0008, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3872594833374023, "rewards/margins": 0.49185243248939514, "rewards/rejected": -1.879111886024475, "sft_loss": 1.4086813926696777, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 15.87398183068991, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.11463092267513275, "logits/rejected": 0.03227987512946129, "logps/chosen": -1.310810923576355, "logps/rejected": -1.6783673763275146, "loss": 1.0297, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.310810923576355, "rewards/margins": 0.36755651235580444, "rewards/rejected": -1.6783673763275146, "sft_loss": 1.366980791091919, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 7.751583510401781, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.2286381721496582, "logits/rejected": -0.05059467628598213, "logps/chosen": -1.3588345050811768, "logps/rejected": -1.907141923904419, "loss": 0.9799, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3588345050811768, "rewards/margins": 0.5483072996139526, "rewards/rejected": -1.907141923904419, "sft_loss": 1.3552730083465576, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 15.233468290648068, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.14470013976097107, "logits/rejected": -0.002064927713945508, "logps/chosen": -1.3128535747528076, "logps/rejected": -1.6920738220214844, "loss": 1.004, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3128535747528076, "rewards/margins": 0.37922030687332153, "rewards/rejected": -1.6920738220214844, "sft_loss": 1.3641343116760254, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 8.056494737856939, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.19823172688484192, "logits/rejected": -0.12065769731998444, "logps/chosen": -1.2947113513946533, "logps/rejected": -1.5259299278259277, "loss": 1.068, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2947113513946533, "rewards/margins": 0.23121850192546844, "rewards/rejected": -1.5259299278259277, "sft_loss": 1.3387796878814697, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 7.755205256691052, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.15351326763629913, "logits/rejected": -0.10742366313934326, "logps/chosen": -1.2829701900482178, "logps/rejected": -1.7134536504745483, "loss": 1.0134, "rewards/accuracies": 0.625, "rewards/chosen": -1.2829701900482178, "rewards/margins": 0.4304834008216858, "rewards/rejected": -1.7134536504745483, "sft_loss": 1.3123705387115479, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 6.88612596291791, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.14032743871212006, "logits/rejected": -0.03774075582623482, "logps/chosen": -1.4290128946304321, "logps/rejected": -1.7514575719833374, "loss": 1.0777, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4290128946304321, "rewards/margins": 0.3224448561668396, "rewards/rejected": -1.7514575719833374, "sft_loss": 1.4194713830947876, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 13.416853169884268, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.226902037858963, "logits/rejected": -0.09653354436159134, "logps/chosen": -1.400376558303833, "logps/rejected": -1.8006101846694946, "loss": 1.0668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.400376558303833, "rewards/margins": 0.40023356676101685, "rewards/rejected": -1.8006101846694946, "sft_loss": 1.423305869102478, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 8.481357085434302, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.13754570484161377, "logits/rejected": -0.15755626559257507, "logps/chosen": -1.2808914184570312, "logps/rejected": -1.6580407619476318, "loss": 1.0402, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2808914184570312, "rewards/margins": 0.37714946269989014, "rewards/rejected": -1.6580407619476318, "sft_loss": 1.3839619159698486, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 6.387669914543872, "learning_rate": 5.890726635828919e-07, "logits/chosen": -0.027116578072309494, "logits/rejected": -0.008474569767713547, "logps/chosen": -1.242453932762146, "logps/rejected": -1.5739244222640991, "loss": 1.0197, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.242453932762146, "rewards/margins": 0.3314705193042755, "rewards/rejected": -1.5739244222640991, "sft_loss": 1.29339599609375, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 19.1852011695506, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.21790878474712372, "logits/rejected": -0.13022013008594513, "logps/chosen": -1.415160894393921, "logps/rejected": -1.7492616176605225, "loss": 1.1144, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.415160894393921, "rewards/margins": 0.334100604057312, "rewards/rejected": -1.7492616176605225, "sft_loss": 1.4745256900787354, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.1278160661458969, "eval_logits/rejected": 0.21500340104103088, "eval_logps/chosen": -1.3652279376983643, "eval_logps/rejected": -1.7446658611297607, "eval_loss": 1.04434072971344, "eval_rewards/accuracies": 0.610534131526947, "eval_rewards/chosen": -1.3652279376983643, "eval_rewards/margins": 0.37943780422210693, "eval_rewards/rejected": -1.7446658611297607, "eval_runtime": 43.4312, "eval_samples_per_second": 30.969, "eval_sft_loss": 1.3886722326278687, "eval_steps_per_second": 7.759, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 6.648291877984254, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.3065665364265442, "logits/rejected": -0.15820252895355225, "logps/chosen": -1.1917800903320312, "logps/rejected": -1.6429237127304077, "loss": 0.9691, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1917800903320312, "rewards/margins": 0.4511435925960541, "rewards/rejected": -1.6429237127304077, "sft_loss": 1.2733690738677979, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 11.874270050708041, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.184895321726799, "logits/rejected": -0.14258211851119995, "logps/chosen": -1.3601287603378296, "logps/rejected": -1.7205755710601807, "loss": 1.0659, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3601287603378296, "rewards/margins": 0.3604467213153839, "rewards/rejected": -1.7205755710601807, "sft_loss": 1.4537384510040283, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 6.962778367607981, "learning_rate": 5.829359458171714e-07, "logits/chosen": -0.09512137621641159, "logits/rejected": 0.032009050250053406, "logps/chosen": -1.3327128887176514, "logps/rejected": -1.7304967641830444, "loss": 1.0174, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3327128887176514, "rewards/margins": 0.397784024477005, "rewards/rejected": -1.7304967641830444, "sft_loss": 1.3504347801208496, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 6.566323185465078, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.13119642436504364, "logits/rejected": 0.04365091398358345, "logps/chosen": -1.2991206645965576, "logps/rejected": -1.5871660709381104, "loss": 1.0471, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2991206645965576, "rewards/margins": 0.2880452871322632, "rewards/rejected": -1.5871660709381104, "sft_loss": 1.3335323333740234, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 9.310666619697258, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.20145635306835175, "logits/rejected": -0.02964537963271141, "logps/chosen": -1.3808945417404175, "logps/rejected": -1.8684800863265991, "loss": 1.0127, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3808945417404175, "rewards/margins": 0.4875854551792145, "rewards/rejected": -1.8684800863265991, "sft_loss": 1.3790462017059326, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 7.469500192905114, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.16935193538665771, "logits/rejected": -0.0209256112575531, "logps/chosen": -1.390914797782898, "logps/rejected": -1.6186745166778564, "loss": 1.108, "rewards/accuracies": 0.5625, "rewards/chosen": -1.390914797782898, "rewards/margins": 0.22775951027870178, "rewards/rejected": -1.6186745166778564, "sft_loss": 1.451371431350708, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 7.823612798328366, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.15821883082389832, "logits/rejected": 0.004854840226471424, "logps/chosen": -1.3072322607040405, "logps/rejected": -1.6984754800796509, "loss": 1.0165, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3072322607040405, "rewards/margins": 0.3912431597709656, "rewards/rejected": -1.6984754800796509, "sft_loss": 1.3407630920410156, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 7.2601639097119275, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.15000715851783752, "logits/rejected": -0.042465973645448685, "logps/chosen": -1.3483084440231323, "logps/rejected": -1.6190989017486572, "loss": 1.0708, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3483084440231323, "rewards/margins": 0.27079030871391296, "rewards/rejected": -1.6190989017486572, "sft_loss": 1.3885855674743652, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 8.863189403136145, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.13697829842567444, "logits/rejected": -0.04790828004479408, "logps/chosen": -1.3052732944488525, "logps/rejected": -1.7480891942977905, "loss": 1.0019, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3052732944488525, "rewards/margins": 0.44281578063964844, "rewards/rejected": -1.7480891942977905, "sft_loss": 1.292776346206665, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 8.138743922757627, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.179933100938797, "logits/rejected": -0.09679999947547913, "logps/chosen": -1.3556548357009888, "logps/rejected": -1.730666160583496, "loss": 1.0524, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3556548357009888, "rewards/margins": 0.3750113844871521, "rewards/rejected": -1.730666160583496, "sft_loss": 1.372294545173645, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 6.969945640232902, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.1762910932302475, "logits/rejected": -0.007267421577125788, "logps/chosen": -1.3951175212860107, "logps/rejected": -1.775962471961975, "loss": 1.0331, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3951175212860107, "rewards/margins": 0.3808448910713196, "rewards/rejected": -1.775962471961975, "sft_loss": 1.431128978729248, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 8.452216105259412, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.25379595160484314, "logits/rejected": -0.07199688255786896, "logps/chosen": -1.2877904176712036, "logps/rejected": -1.5933940410614014, "loss": 1.0154, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2877904176712036, "rewards/margins": 0.30560365319252014, "rewards/rejected": -1.5933940410614014, "sft_loss": 1.3085378408432007, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 6.1146357749691225, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.27868279814720154, "logits/rejected": -0.142146036028862, "logps/chosen": -1.2972919940948486, "logps/rejected": -1.712339162826538, "loss": 1.0044, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2972919940948486, "rewards/margins": 0.41504722833633423, "rewards/rejected": -1.712339162826538, "sft_loss": 1.356175422668457, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 8.680293978801568, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.19600138068199158, "logits/rejected": -0.07686041295528412, "logps/chosen": -1.3629693984985352, "logps/rejected": -1.5587340593338013, "loss": 1.0804, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3629693984985352, "rewards/margins": 0.19576458632946014, "rewards/rejected": -1.5587340593338013, "sft_loss": 1.344761610031128, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 7.528570843500533, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.17637869715690613, "logits/rejected": -0.07687224447727203, "logps/chosen": -1.3604662418365479, "logps/rejected": -1.6803786754608154, "loss": 1.016, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3604662418365479, "rewards/margins": 0.3199126124382019, "rewards/rejected": -1.6803786754608154, "sft_loss": 1.347644567489624, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 10.17870051576461, "learning_rate": 5.629076484188952e-07, "logits/chosen": -0.04938278719782829, "logits/rejected": 0.0634743794798851, "logps/chosen": -1.3148540258407593, "logps/rejected": -1.758033037185669, "loss": 1.0271, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3148540258407593, "rewards/margins": 0.4431789815425873, "rewards/rejected": -1.758033037185669, "sft_loss": 1.3470075130462646, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 8.577794751942648, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.14258165657520294, "logits/rejected": -0.015600791200995445, "logps/chosen": -1.3516905307769775, "logps/rejected": -1.742180585861206, "loss": 1.0458, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3516905307769775, "rewards/margins": 0.39049032330513, "rewards/rejected": -1.742180585861206, "sft_loss": 1.3893133401870728, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 8.570238281275206, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.21197304129600525, "logits/rejected": -0.009961167350411415, "logps/chosen": -1.3131115436553955, "logps/rejected": -1.6712608337402344, "loss": 1.017, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3131115436553955, "rewards/margins": 0.3581491708755493, "rewards/rejected": -1.6712608337402344, "sft_loss": 1.3298722505569458, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 10.572661224304408, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.2016395777463913, "logits/rejected": -0.055265843868255615, "logps/chosen": -1.2552399635314941, "logps/rejected": -1.6915838718414307, "loss": 0.9776, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2552399635314941, "rewards/margins": 0.43634381890296936, "rewards/rejected": -1.6915838718414307, "sft_loss": 1.2686001062393188, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 6.548478183198705, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.10608162730932236, "logits/rejected": -0.001629498554393649, "logps/chosen": -1.31504225730896, "logps/rejected": -1.6856321096420288, "loss": 1.0273, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.31504225730896, "rewards/margins": 0.37058982253074646, "rewards/rejected": -1.6856321096420288, "sft_loss": 1.368192434310913, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 10.097095692347898, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.051770079880952835, "logits/rejected": -0.024975869804620743, "logps/chosen": -1.3028028011322021, "logps/rejected": -1.6529500484466553, "loss": 1.0398, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3028028011322021, "rewards/margins": 0.3501472473144531, "rewards/rejected": -1.6529500484466553, "sft_loss": 1.3389087915420532, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 6.663043928077902, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.1299368143081665, "logits/rejected": -0.03927867114543915, "logps/chosen": -1.254652976989746, "logps/rejected": -1.5375810861587524, "loss": 1.0272, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.254652976989746, "rewards/margins": 0.2829279899597168, "rewards/rejected": -1.5375810861587524, "sft_loss": 1.3334118127822876, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 13.30160450099127, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.0906791165471077, "logits/rejected": 0.07906799018383026, "logps/chosen": -1.3736572265625, "logps/rejected": -1.728371024131775, "loss": 1.0719, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3736572265625, "rewards/margins": 0.3547138273715973, "rewards/rejected": -1.728371024131775, "sft_loss": 1.4097964763641357, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 10.224493828222354, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.06458568572998047, "logits/rejected": 0.05855490639805794, "logps/chosen": -1.3306993246078491, "logps/rejected": -1.7111177444458008, "loss": 1.0252, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3306993246078491, "rewards/margins": 0.38041844964027405, "rewards/rejected": -1.7111177444458008, "sft_loss": 1.3561182022094727, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 10.674837251777372, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.09088797867298126, "logits/rejected": -0.03129405155777931, "logps/chosen": -1.2738713026046753, "logps/rejected": -1.697519302368164, "loss": 1.015, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2738713026046753, "rewards/margins": 0.4236481785774231, "rewards/rejected": -1.697519302368164, "sft_loss": 1.2819215059280396, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 7.967714066643323, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.2462097853422165, "logits/rejected": -0.07248953729867935, "logps/chosen": -1.2905464172363281, "logps/rejected": -1.9068615436553955, "loss": 0.9933, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2905464172363281, "rewards/margins": 0.6163150072097778, "rewards/rejected": -1.9068615436553955, "sft_loss": 1.3654937744140625, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 8.802652583026893, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.20253901183605194, "logits/rejected": -0.05758960172533989, "logps/chosen": -1.345850944519043, "logps/rejected": -1.7187354564666748, "loss": 1.0459, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.345850944519043, "rewards/margins": 0.3728848099708557, "rewards/rejected": -1.7187354564666748, "sft_loss": 1.3998357057571411, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 5.930700000256952, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.19098657369613647, "logits/rejected": -0.015033292584121227, "logps/chosen": -1.3140078783035278, "logps/rejected": -1.8780615329742432, "loss": 1.0028, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3140078783035278, "rewards/margins": 0.5640536546707153, "rewards/rejected": -1.8780615329742432, "sft_loss": 1.3845045566558838, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 9.34078243263626, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.17489251494407654, "logits/rejected": -0.025254786014556885, "logps/chosen": -1.3132505416870117, "logps/rejected": -1.6130483150482178, "loss": 1.0468, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3132505416870117, "rewards/margins": 0.2997978925704956, "rewards/rejected": -1.6130483150482178, "sft_loss": 1.3403208255767822, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 33.24780731727735, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.07287373393774033, "logits/rejected": 0.18627944588661194, "logps/chosen": -1.2778263092041016, "logps/rejected": -1.8012363910675049, "loss": 0.9706, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2778263092041016, "rewards/margins": 0.5234102010726929, "rewards/rejected": -1.8012363910675049, "sft_loss": 1.2985767126083374, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 11.258095186099865, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.07001444697380066, "logits/rejected": 0.03678502142429352, "logps/chosen": -1.2881648540496826, "logps/rejected": -1.7102916240692139, "loss": 0.9926, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2881648540496826, "rewards/margins": 0.42212677001953125, "rewards/rejected": -1.7102916240692139, "sft_loss": 1.3350517749786377, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 8.822122841064651, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.1789834201335907, "logits/rejected": -0.04704167693853378, "logps/chosen": -1.3048163652420044, "logps/rejected": -1.6448017358779907, "loss": 1.0211, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3048163652420044, "rewards/margins": 0.339985191822052, "rewards/rejected": -1.6448017358779907, "sft_loss": 1.323728322982788, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 7.59416415046014, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.15828174352645874, "logits/rejected": -0.06027153134346008, "logps/chosen": -1.2840261459350586, "logps/rejected": -1.6559422016143799, "loss": 1.0335, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2840261459350586, "rewards/margins": 0.3719159960746765, "rewards/rejected": -1.6559422016143799, "sft_loss": 1.2959760427474976, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 12.418799474714286, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.19060225784778595, "logits/rejected": -0.008369709365069866, "logps/chosen": -1.3549988269805908, "logps/rejected": -1.6975902318954468, "loss": 1.0852, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3549988269805908, "rewards/margins": 0.34259122610092163, "rewards/rejected": -1.6975902318954468, "sft_loss": 1.4190717935562134, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 7.139956178679369, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.17740900814533234, "logits/rejected": -0.034846335649490356, "logps/chosen": -1.3455655574798584, "logps/rejected": -1.7951133251190186, "loss": 1.0253, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3455655574798584, "rewards/margins": 0.4495477080345154, "rewards/rejected": -1.7951133251190186, "sft_loss": 1.4229328632354736, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 5.886677056137338, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.27010422945022583, "logits/rejected": -0.11675111204385757, "logps/chosen": -1.4220235347747803, "logps/rejected": -1.7936725616455078, "loss": 1.0341, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4220235347747803, "rewards/margins": 0.37164920568466187, "rewards/rejected": -1.7936725616455078, "sft_loss": 1.4209567308425903, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 7.56605036688191, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.2440626621246338, "logits/rejected": -0.06764684617519379, "logps/chosen": -1.4485827684402466, "logps/rejected": -1.7837321758270264, "loss": 1.0784, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4485827684402466, "rewards/margins": 0.3351495563983917, "rewards/rejected": -1.7837321758270264, "sft_loss": 1.3873783349990845, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 19.071359847499636, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.2108345329761505, "logits/rejected": -0.03887351602315903, "logps/chosen": -1.3028171062469482, "logps/rejected": -1.8426485061645508, "loss": 0.9855, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3028171062469482, "rewards/margins": 0.5398311614990234, "rewards/rejected": -1.8426485061645508, "sft_loss": 1.3312140703201294, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 10.569481667131303, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.22008387744426727, "logits/rejected": -0.12119672447443008, "logps/chosen": -1.3868043422698975, "logps/rejected": -1.9178674221038818, "loss": 1.0373, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3868043422698975, "rewards/margins": 0.5310630798339844, "rewards/rejected": -1.9178674221038818, "sft_loss": 1.4592076539993286, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 5.29555931503069, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.2550900876522064, "logits/rejected": -0.11833520233631134, "logps/chosen": -1.4027044773101807, "logps/rejected": -1.7607784271240234, "loss": 1.0605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4027044773101807, "rewards/margins": 0.3580739498138428, "rewards/rejected": -1.7607784271240234, "sft_loss": 1.3712348937988281, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 11.190167224051493, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.26853471994400024, "logits/rejected": -0.12684592604637146, "logps/chosen": -1.2774595022201538, "logps/rejected": -1.6234922409057617, "loss": 1.0173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2774595022201538, "rewards/margins": 0.3460327684879303, "rewards/rejected": -1.6234922409057617, "sft_loss": 1.2910670042037964, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 9.278021212318102, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.2627517282962799, "logits/rejected": -0.13891619443893433, "logps/chosen": -1.315260887145996, "logps/rejected": -1.6367619037628174, "loss": 1.0562, "rewards/accuracies": 0.59375, "rewards/chosen": -1.315260887145996, "rewards/margins": 0.3215009570121765, "rewards/rejected": -1.6367619037628174, "sft_loss": 1.3411436080932617, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 10.036290188631336, "learning_rate": 5.210187404905735e-07, "logits/chosen": -0.07215414196252823, "logits/rejected": 0.001811787486076355, "logps/chosen": -1.358033537864685, "logps/rejected": -1.692010521888733, "loss": 1.0435, "rewards/accuracies": 0.625, "rewards/chosen": -1.358033537864685, "rewards/margins": 0.3339768946170807, "rewards/rejected": -1.692010521888733, "sft_loss": 1.3851827383041382, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 8.720723133324936, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.2639918327331543, "logits/rejected": -0.11108319461345673, "logps/chosen": -1.295902967453003, "logps/rejected": -1.6099973917007446, "loss": 1.0365, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.295902967453003, "rewards/margins": 0.3140943646430969, "rewards/rejected": -1.6099973917007446, "sft_loss": 1.3766124248504639, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 7.357840446224466, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.19443385303020477, "logits/rejected": -0.05075268819928169, "logps/chosen": -1.2535735368728638, "logps/rejected": -1.5689475536346436, "loss": 1.0252, "rewards/accuracies": 0.625, "rewards/chosen": -1.2535735368728638, "rewards/margins": 0.3153740167617798, "rewards/rejected": -1.5689475536346436, "sft_loss": 1.3399714231491089, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 6.970022638713376, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.168025940656662, "logits/rejected": -0.02265249192714691, "logps/chosen": -1.3693130016326904, "logps/rejected": -1.6871531009674072, "loss": 1.0515, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3693130016326904, "rewards/margins": 0.31784000992774963, "rewards/rejected": -1.6871531009674072, "sft_loss": 1.3919140100479126, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 10.504597847545845, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.0708565041422844, "logits/rejected": 0.0453130379319191, "logps/chosen": -1.325622797012329, "logps/rejected": -1.514797568321228, "loss": 1.0581, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.325622797012329, "rewards/margins": 0.18917487561702728, "rewards/rejected": -1.514797568321228, "sft_loss": 1.327514410018921, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 8.09179800215906, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.18696202337741852, "logits/rejected": -0.12388841807842255, "logps/chosen": -1.2330793142318726, "logps/rejected": -1.7086670398712158, "loss": 0.9741, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2330793142318726, "rewards/margins": 0.4755876660346985, "rewards/rejected": -1.7086670398712158, "sft_loss": 1.2789698839187622, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 12.564328773720826, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.19994398951530457, "logits/rejected": -0.10410158336162567, "logps/chosen": -1.267035961151123, "logps/rejected": -1.5420608520507812, "loss": 1.0454, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.267035961151123, "rewards/margins": 0.27502498030662537, "rewards/rejected": -1.5420608520507812, "sft_loss": 1.2928426265716553, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 6.7703373107950044, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.22879798710346222, "logits/rejected": -0.07329122722148895, "logps/chosen": -1.3567241430282593, "logps/rejected": -1.8353170156478882, "loss": 1.0171, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3567241430282593, "rewards/margins": 0.47859278321266174, "rewards/rejected": -1.8353170156478882, "sft_loss": 1.4154313802719116, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 8.358972041931533, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.23752336204051971, "logits/rejected": -0.10456007719039917, "logps/chosen": -1.3062692880630493, "logps/rejected": -1.7325499057769775, "loss": 0.998, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3062692880630493, "rewards/margins": 0.4262804090976715, "rewards/rejected": -1.7325499057769775, "sft_loss": 1.3278343677520752, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 5.687794422303738, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.22338104248046875, "logits/rejected": -0.05207739397883415, "logps/chosen": -1.3390872478485107, "logps/rejected": -1.688347578048706, "loss": 1.0552, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3390872478485107, "rewards/margins": 0.349260538816452, "rewards/rejected": -1.688347578048706, "sft_loss": 1.382442831993103, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 10.26839397605355, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.12670452892780304, "logits/rejected": -0.08819358795881271, "logps/chosen": -1.3337876796722412, "logps/rejected": -1.8196563720703125, "loss": 1.0039, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3337876796722412, "rewards/margins": 0.4858686327934265, "rewards/rejected": -1.8196563720703125, "sft_loss": 1.370620608329773, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 8.743863475193205, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.25462573766708374, "logits/rejected": -0.137288436293602, "logps/chosen": -1.3446637392044067, "logps/rejected": -1.7400169372558594, "loss": 1.0353, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3446637392044067, "rewards/margins": 0.395353227853775, "rewards/rejected": -1.7400169372558594, "sft_loss": 1.392261266708374, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 6.524113767489526, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.2823813259601593, "logits/rejected": -0.21783390641212463, "logps/chosen": -1.274153232574463, "logps/rejected": -1.6759366989135742, "loss": 0.9917, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.274153232574463, "rewards/margins": 0.40178337693214417, "rewards/rejected": -1.6759366989135742, "sft_loss": 1.2710778713226318, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 7.768779479863598, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.33525434136390686, "logits/rejected": -0.15368661284446716, "logps/chosen": -1.2215659618377686, "logps/rejected": -1.6280514001846313, "loss": 0.9814, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2215659618377686, "rewards/margins": 0.406485378742218, "rewards/rejected": -1.6280514001846313, "sft_loss": 1.2901760339736938, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 5.962837403904988, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.14228685200214386, "logits/rejected": -0.13161325454711914, "logps/chosen": -1.2799794673919678, "logps/rejected": -1.6704368591308594, "loss": 0.9987, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2799794673919678, "rewards/margins": 0.3904576301574707, "rewards/rejected": -1.6704368591308594, "sft_loss": 1.3193469047546387, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 7.4051418349634, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.1406664401292801, "logits/rejected": -0.0699944943189621, "logps/chosen": -1.320504903793335, "logps/rejected": -1.5901587009429932, "loss": 1.0914, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.320504903793335, "rewards/margins": 0.26965343952178955, "rewards/rejected": -1.5901587009429932, "sft_loss": 1.3954423666000366, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 6.757034888848797, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.18777263164520264, "logits/rejected": -0.04904399812221527, "logps/chosen": -1.3198715448379517, "logps/rejected": -1.7347841262817383, "loss": 1.003, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3198715448379517, "rewards/margins": 0.4149126410484314, "rewards/rejected": -1.7347841262817383, "sft_loss": 1.34419846534729, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 10.260517271328002, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.336527019739151, "logits/rejected": -0.20288416743278503, "logps/chosen": -1.3640118837356567, "logps/rejected": -1.6893068552017212, "loss": 1.0775, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3640118837356567, "rewards/margins": 0.32529503107070923, "rewards/rejected": -1.6893068552017212, "sft_loss": 1.4524770975112915, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 11.022266750671784, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.27054914832115173, "logits/rejected": -0.0914614200592041, "logps/chosen": -1.3564226627349854, "logps/rejected": -1.6926740407943726, "loss": 1.0388, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3564226627349854, "rewards/margins": 0.33625149726867676, "rewards/rejected": -1.6926740407943726, "sft_loss": 1.4034146070480347, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 10.413816986747639, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.19406765699386597, "logits/rejected": -0.09133412688970566, "logps/chosen": -1.3404884338378906, "logps/rejected": -1.7077680826187134, "loss": 1.0173, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3404884338378906, "rewards/margins": 0.36727944016456604, "rewards/rejected": -1.7077680826187134, "sft_loss": 1.2941471338272095, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 8.872808421935437, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.10099422931671143, "logits/rejected": -0.016091059893369675, "logps/chosen": -1.3404185771942139, "logps/rejected": -1.7624202966690063, "loss": 1.0256, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3404185771942139, "rewards/margins": 0.4220016598701477, "rewards/rejected": -1.7624202966690063, "sft_loss": 1.4071005582809448, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 6.796383293872087, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.2519132196903229, "logits/rejected": -0.10128184407949448, "logps/chosen": -1.403340220451355, "logps/rejected": -1.7736599445343018, "loss": 1.0702, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.403340220451355, "rewards/margins": 0.370319664478302, "rewards/rejected": -1.7736599445343018, "sft_loss": 1.4098050594329834, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 5.748312285692267, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.15138813853263855, "logits/rejected": -0.04131780564785004, "logps/chosen": -1.4313924312591553, "logps/rejected": -1.8283259868621826, "loss": 1.037, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4313924312591553, "rewards/margins": 0.39693355560302734, "rewards/rejected": -1.8283259868621826, "sft_loss": 1.3752673864364624, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 8.76407108118166, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.15335145592689514, "logits/rejected": -0.016062330454587936, "logps/chosen": -1.3842003345489502, "logps/rejected": -1.7323811054229736, "loss": 1.0399, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3842003345489502, "rewards/margins": 0.3481805920600891, "rewards/rejected": -1.7323811054229736, "sft_loss": 1.4326179027557373, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 6.8240954594636385, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.10055909305810928, "logits/rejected": -0.03672239929437637, "logps/chosen": -1.3982130289077759, "logps/rejected": -1.7273247241973877, "loss": 1.0455, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3982130289077759, "rewards/margins": 0.3291115462779999, "rewards/rejected": -1.7273247241973877, "sft_loss": 1.3380857706069946, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 5.681251133368896, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.38894954323768616, "logits/rejected": -0.23619875311851501, "logps/chosen": -1.250199317932129, "logps/rejected": -1.519134283065796, "loss": 1.0452, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.250199317932129, "rewards/margins": 0.26893505454063416, "rewards/rejected": -1.519134283065796, "sft_loss": 1.3250980377197266, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 8.042233248752146, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.1970815658569336, "logits/rejected": -0.060459040105342865, "logps/chosen": -1.294335961341858, "logps/rejected": -1.7021186351776123, "loss": 1.0417, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.294335961341858, "rewards/margins": 0.407782644033432, "rewards/rejected": -1.7021186351776123, "sft_loss": 1.3637042045593262, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 9.010907080241779, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.33565548062324524, "logits/rejected": -0.21785983443260193, "logps/chosen": -1.3821521997451782, "logps/rejected": -1.7122135162353516, "loss": 1.0358, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3821521997451782, "rewards/margins": 0.33006131649017334, "rewards/rejected": -1.7122135162353516, "sft_loss": 1.374982237815857, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 9.202178181845822, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.31604892015457153, "logits/rejected": -0.2019950896501541, "logps/chosen": -1.2382168769836426, "logps/rejected": -1.6578218936920166, "loss": 0.9991, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2382168769836426, "rewards/margins": 0.41960495710372925, "rewards/rejected": -1.6578218936920166, "sft_loss": 1.2910674810409546, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 8.42016279895261, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.26183784008026123, "logits/rejected": -0.24288320541381836, "logps/chosen": -1.3160405158996582, "logps/rejected": -1.7478888034820557, "loss": 1.0328, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3160405158996582, "rewards/margins": 0.4318482279777527, "rewards/rejected": -1.7478888034820557, "sft_loss": 1.3997399806976318, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 5.810263975580805, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.3238453269004822, "logits/rejected": -0.23669204115867615, "logps/chosen": -1.3417284488677979, "logps/rejected": -1.6320661306381226, "loss": 1.0733, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3417284488677979, "rewards/margins": 0.2903375029563904, "rewards/rejected": -1.6320661306381226, "sft_loss": 1.40792715549469, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 7.031312718505899, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.2895796000957489, "logits/rejected": -0.193088099360466, "logps/chosen": -1.3276104927062988, "logps/rejected": -1.6946378946304321, "loss": 1.0454, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3276104927062988, "rewards/margins": 0.36702749133110046, "rewards/rejected": -1.6946378946304321, "sft_loss": 1.3769690990447998, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 11.568266998719043, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.31578174233436584, "logits/rejected": -0.22001910209655762, "logps/chosen": -1.3103505373001099, "logps/rejected": -1.603722333908081, "loss": 1.0611, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3103505373001099, "rewards/margins": 0.29337188601493835, "rewards/rejected": -1.603722333908081, "sft_loss": 1.345990538597107, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 6.16136392232105, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.1511804759502411, "logits/rejected": -0.0974348783493042, "logps/chosen": -1.3440520763397217, "logps/rejected": -1.8585773706436157, "loss": 1.0114, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3440520763397217, "rewards/margins": 0.5145252346992493, "rewards/rejected": -1.8585773706436157, "sft_loss": 1.354104995727539, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 7.683076853167783, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.20450139045715332, "logits/rejected": -0.1865270435810089, "logps/chosen": -1.297811508178711, "logps/rejected": -1.5590054988861084, "loss": 1.0452, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.297811508178711, "rewards/margins": 0.2611939609050751, "rewards/rejected": -1.5590054988861084, "sft_loss": 1.3574228286743164, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 7.340556040477444, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.2812632918357849, "logits/rejected": -0.1310562640428543, "logps/chosen": -1.2653824090957642, "logps/rejected": -1.74893057346344, "loss": 0.9541, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2653824090957642, "rewards/margins": 0.483548104763031, "rewards/rejected": -1.74893057346344, "sft_loss": 1.2568496465682983, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 8.825030310410854, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.2938292920589447, "logits/rejected": -0.1903020441532135, "logps/chosen": -1.2760530710220337, "logps/rejected": -1.7193348407745361, "loss": 0.9749, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2760530710220337, "rewards/margins": 0.4432816505432129, "rewards/rejected": -1.7193348407745361, "sft_loss": 1.2668559551239014, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 7.873635336903305, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.19705058634281158, "logits/rejected": -0.06154204532504082, "logps/chosen": -1.3533506393432617, "logps/rejected": -1.8990510702133179, "loss": 1.0196, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3533506393432617, "rewards/margins": 0.5457005500793457, "rewards/rejected": -1.8990510702133179, "sft_loss": 1.3797739744186401, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.10069998353719711, "eval_logits/rejected": 0.18723244965076447, "eval_logps/chosen": -1.361523985862732, "eval_logps/rejected": -1.7338141202926636, "eval_loss": 1.0449482202529907, "eval_rewards/accuracies": 0.6142433285713196, "eval_rewards/chosen": -1.361523985862732, "eval_rewards/margins": 0.37229031324386597, "eval_rewards/rejected": -1.7338141202926636, "eval_runtime": 43.4483, "eval_samples_per_second": 30.956, "eval_sft_loss": 1.384097933769226, "eval_steps_per_second": 7.756, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 6.844268168714967, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.3229488432407379, "logits/rejected": -0.20911166071891785, "logps/chosen": -1.3647921085357666, "logps/rejected": -1.7320101261138916, "loss": 1.0377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3647921085357666, "rewards/margins": 0.3672178387641907, "rewards/rejected": -1.7320101261138916, "sft_loss": 1.368546485900879, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 8.115234289434698, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.28309884667396545, "logits/rejected": -0.18161797523498535, "logps/chosen": -1.3209306001663208, "logps/rejected": -1.6985241174697876, "loss": 1.0407, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3209306001663208, "rewards/margins": 0.377593457698822, "rewards/rejected": -1.6985241174697876, "sft_loss": 1.3438318967819214, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 8.894952523029621, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.26940733194351196, "logits/rejected": -0.14569416642189026, "logps/chosen": -1.3172519207000732, "logps/rejected": -1.7098270654678345, "loss": 1.029, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3172519207000732, "rewards/margins": 0.39257511496543884, "rewards/rejected": -1.7098270654678345, "sft_loss": 1.3400871753692627, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 8.060961097393376, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.24872052669525146, "logits/rejected": -0.03148692101240158, "logps/chosen": -1.4487375020980835, "logps/rejected": -1.8613462448120117, "loss": 1.0828, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4487375020980835, "rewards/margins": 0.4126088619232178, "rewards/rejected": -1.8613462448120117, "sft_loss": 1.4077409505844116, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 8.652416739646052, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.24346446990966797, "logits/rejected": -0.10656796395778656, "logps/chosen": -1.257863998413086, "logps/rejected": -1.7140146493911743, "loss": 0.9896, "rewards/accuracies": 0.65625, "rewards/chosen": -1.257863998413086, "rewards/margins": 0.456150621175766, "rewards/rejected": -1.7140146493911743, "sft_loss": 1.3103606700897217, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 7.284827283113485, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.2828959822654724, "logits/rejected": -0.13060639798641205, "logps/chosen": -1.3334567546844482, "logps/rejected": -1.8920681476593018, "loss": 0.9674, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3334567546844482, "rewards/margins": 0.558611273765564, "rewards/rejected": -1.8920681476593018, "sft_loss": 1.3370991945266724, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 6.933979823213048, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.26814383268356323, "logits/rejected": -0.20636215806007385, "logps/chosen": -1.3193995952606201, "logps/rejected": -1.7179428339004517, "loss": 1.0148, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3193995952606201, "rewards/margins": 0.3985433876514435, "rewards/rejected": -1.7179428339004517, "sft_loss": 1.3590309619903564, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 7.415555323593706, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.18289199471473694, "logits/rejected": -0.08508212119340897, "logps/chosen": -1.292557954788208, "logps/rejected": -1.7914676666259766, "loss": 0.9544, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.292557954788208, "rewards/margins": 0.4989096522331238, "rewards/rejected": -1.7914676666259766, "sft_loss": 1.2400840520858765, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 14.699238080869227, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.35700684785842896, "logits/rejected": -0.21485145390033722, "logps/chosen": -1.333528995513916, "logps/rejected": -1.781053900718689, "loss": 1.0136, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.333528995513916, "rewards/margins": 0.4475248456001282, "rewards/rejected": -1.781053900718689, "sft_loss": 1.3286097049713135, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 9.524357067386456, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.13468368351459503, "logits/rejected": 0.0022729337215423584, "logps/chosen": -1.3335119485855103, "logps/rejected": -1.7451871633529663, "loss": 1.0217, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3335119485855103, "rewards/margins": 0.4116753041744232, "rewards/rejected": -1.7451871633529663, "sft_loss": 1.3561052083969116, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 6.71190547951929, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.17234370112419128, "logits/rejected": -0.09425880759954453, "logps/chosen": -1.3979463577270508, "logps/rejected": -1.6748193502426147, "loss": 1.0709, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3979463577270508, "rewards/margins": 0.27687305212020874, "rewards/rejected": -1.6748193502426147, "sft_loss": 1.3898849487304688, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 7.512455926222893, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.30011698603630066, "logits/rejected": -0.16482755541801453, "logps/chosen": -1.3577678203582764, "logps/rejected": -1.6450729370117188, "loss": 1.0584, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3577678203582764, "rewards/margins": 0.28730517625808716, "rewards/rejected": -1.6450729370117188, "sft_loss": 1.364136815071106, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 10.350971747439411, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.19551385939121246, "logits/rejected": -0.09423010051250458, "logps/chosen": -1.283046841621399, "logps/rejected": -1.7449703216552734, "loss": 0.9992, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.283046841621399, "rewards/margins": 0.4619235396385193, "rewards/rejected": -1.7449703216552734, "sft_loss": 1.334883689880371, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 7.971566899292058, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.19808469712734222, "logits/rejected": -0.051246147602796555, "logps/chosen": -1.2304213047027588, "logps/rejected": -1.6856091022491455, "loss": 0.975, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2304213047027588, "rewards/margins": 0.4551876485347748, "rewards/rejected": -1.6856091022491455, "sft_loss": 1.2523653507232666, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 6.508714328333814, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.25844448804855347, "logits/rejected": 0.04663591459393501, "logps/chosen": -1.3295787572860718, "logps/rejected": -1.7243293523788452, "loss": 1.0324, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3295787572860718, "rewards/margins": 0.3947505056858063, "rewards/rejected": -1.7243293523788452, "sft_loss": 1.3863446712493896, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 12.375952625326908, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.2323465794324875, "logits/rejected": -0.14086218178272247, "logps/chosen": -1.2785847187042236, "logps/rejected": -1.6352514028549194, "loss": 1.0352, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2785847187042236, "rewards/margins": 0.3566668629646301, "rewards/rejected": -1.6352514028549194, "sft_loss": 1.3782927989959717, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 7.149176077230483, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.25271302461624146, "logits/rejected": -0.04393969476222992, "logps/chosen": -1.289284586906433, "logps/rejected": -1.7105737924575806, "loss": 1.0153, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.289284586906433, "rewards/margins": 0.4212891459465027, "rewards/rejected": -1.7105737924575806, "sft_loss": 1.3399940729141235, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 9.483763943800556, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.18309524655342102, "logits/rejected": -0.07059819251298904, "logps/chosen": -1.335192084312439, "logps/rejected": -1.6796214580535889, "loss": 1.0525, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.335192084312439, "rewards/margins": 0.3444294333457947, "rewards/rejected": -1.6796214580535889, "sft_loss": 1.3960546255111694, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 9.271513968041686, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.2182222157716751, "logits/rejected": -0.14940352737903595, "logps/chosen": -1.442674994468689, "logps/rejected": -1.7633249759674072, "loss": 1.1113, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.442674994468689, "rewards/margins": 0.32064980268478394, "rewards/rejected": -1.7633249759674072, "sft_loss": 1.4603216648101807, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 7.3965118387087525, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.31792744994163513, "logits/rejected": -0.17529422044754028, "logps/chosen": -1.4276206493377686, "logps/rejected": -1.8709055185317993, "loss": 1.0613, "rewards/accuracies": 0.625, "rewards/chosen": -1.4276206493377686, "rewards/margins": 0.44328489899635315, "rewards/rejected": -1.8709055185317993, "sft_loss": 1.4553303718566895, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 11.12920257714004, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.2868819534778595, "logits/rejected": -0.1467498242855072, "logps/chosen": -1.3745849132537842, "logps/rejected": -1.7479822635650635, "loss": 1.0532, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3745849132537842, "rewards/margins": 0.3733974099159241, "rewards/rejected": -1.7479822635650635, "sft_loss": 1.378862977027893, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 7.334585852212527, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.12723413109779358, "logits/rejected": -0.08206866681575775, "logps/chosen": -1.3312435150146484, "logps/rejected": -1.708142876625061, "loss": 1.0238, "rewards/accuracies": 0.625, "rewards/chosen": -1.3312435150146484, "rewards/margins": 0.37689924240112305, "rewards/rejected": -1.708142876625061, "sft_loss": 1.3588409423828125, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 13.909124891292313, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.32031458616256714, "logits/rejected": -0.17457079887390137, "logps/chosen": -1.3311392068862915, "logps/rejected": -1.7172855138778687, "loss": 1.0166, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3311392068862915, "rewards/margins": 0.38614630699157715, "rewards/rejected": -1.7172855138778687, "sft_loss": 1.3369089365005493, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 7.115716742325556, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.22326748073101044, "logits/rejected": -0.035021353513002396, "logps/chosen": -1.3363620042800903, "logps/rejected": -1.958900809288025, "loss": 0.995, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3363620042800903, "rewards/margins": 0.6225385665893555, "rewards/rejected": -1.958900809288025, "sft_loss": 1.3878682851791382, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 8.268366165173441, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.28518885374069214, "logits/rejected": -0.10537783056497574, "logps/chosen": -1.3669929504394531, "logps/rejected": -1.8190317153930664, "loss": 1.0318, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3669929504394531, "rewards/margins": 0.4520387053489685, "rewards/rejected": -1.8190317153930664, "sft_loss": 1.4433027505874634, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 7.6136076819743, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.1993323266506195, "logits/rejected": -0.09304220974445343, "logps/chosen": -1.283071756362915, "logps/rejected": -1.7913768291473389, "loss": 0.9885, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.283071756362915, "rewards/margins": 0.5083053112030029, "rewards/rejected": -1.7913768291473389, "sft_loss": 1.3137251138687134, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 13.182307661983321, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.22208702564239502, "logits/rejected": -0.06606097519397736, "logps/chosen": -1.3824620246887207, "logps/rejected": -1.769325852394104, "loss": 1.026, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3824620246887207, "rewards/margins": 0.3868637979030609, "rewards/rejected": -1.769325852394104, "sft_loss": 1.3636934757232666, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 7.351513845749289, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.1672711819410324, "logits/rejected": -0.013500380329787731, "logps/chosen": -1.2844191789627075, "logps/rejected": -1.7520344257354736, "loss": 1.0129, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2844191789627075, "rewards/margins": 0.4676152765750885, "rewards/rejected": -1.7520344257354736, "sft_loss": 1.3409457206726074, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 6.964401966409443, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.16246755421161652, "logits/rejected": 0.018709395080804825, "logps/chosen": -1.2220256328582764, "logps/rejected": -1.6237547397613525, "loss": 0.9836, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2220256328582764, "rewards/margins": 0.40172892808914185, "rewards/rejected": -1.6237547397613525, "sft_loss": 1.2745763063430786, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 8.104077888153768, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.33674901723861694, "logits/rejected": -0.18985441327095032, "logps/chosen": -1.391183614730835, "logps/rejected": -1.7601451873779297, "loss": 1.0466, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.391183614730835, "rewards/margins": 0.36896148324012756, "rewards/rejected": -1.7601451873779297, "sft_loss": 1.4082539081573486, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 8.728384432383228, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.11587512493133545, "logits/rejected": -0.07313670217990875, "logps/chosen": -1.419593334197998, "logps/rejected": -1.6886402368545532, "loss": 1.1363, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.419593334197998, "rewards/margins": 0.26904693245887756, "rewards/rejected": -1.6886402368545532, "sft_loss": 1.4004647731781006, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 8.470114959808182, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.2647283673286438, "logits/rejected": -0.06007467582821846, "logps/chosen": -1.2760438919067383, "logps/rejected": -1.6822011470794678, "loss": 1.0078, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2760438919067383, "rewards/margins": 0.4061572551727295, "rewards/rejected": -1.6822011470794678, "sft_loss": 1.301793098449707, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 7.793389012007866, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.2339763194322586, "logits/rejected": -0.06817017495632172, "logps/chosen": -1.318159818649292, "logps/rejected": -1.762577772140503, "loss": 0.9987, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.318159818649292, "rewards/margins": 0.444417804479599, "rewards/rejected": -1.762577772140503, "sft_loss": 1.3555116653442383, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 10.438996597528387, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.20583009719848633, "logits/rejected": -0.045845162123441696, "logps/chosen": -1.356227159500122, "logps/rejected": -1.6683311462402344, "loss": 1.0769, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.356227159500122, "rewards/margins": 0.31210413575172424, "rewards/rejected": -1.6683311462402344, "sft_loss": 1.351169228553772, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 12.764675577748738, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.07709778100252151, "logits/rejected": -0.026763681322336197, "logps/chosen": -1.3959242105484009, "logps/rejected": -1.683182954788208, "loss": 1.0913, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3959242105484009, "rewards/margins": 0.2872585356235504, "rewards/rejected": -1.683182954788208, "sft_loss": 1.3707129955291748, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 5.923485220700442, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.2798308730125427, "logits/rejected": -0.13452866673469543, "logps/chosen": -1.3274872303009033, "logps/rejected": -1.761406660079956, "loss": 0.983, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3274872303009033, "rewards/margins": 0.4339195787906647, "rewards/rejected": -1.761406660079956, "sft_loss": 1.3321391344070435, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 7.301258798639009, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.2560671865940094, "logits/rejected": -0.22414672374725342, "logps/chosen": -1.3753451108932495, "logps/rejected": -1.6603202819824219, "loss": 1.067, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3753451108932495, "rewards/margins": 0.28497499227523804, "rewards/rejected": -1.6603202819824219, "sft_loss": 1.3981544971466064, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 9.792834224375655, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.1899821162223816, "logits/rejected": -0.06347040086984634, "logps/chosen": -1.397888422012329, "logps/rejected": -1.7293627262115479, "loss": 1.0746, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.397888422012329, "rewards/margins": 0.3314744234085083, "rewards/rejected": -1.7293627262115479, "sft_loss": 1.4246985912322998, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 9.254911328138565, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.23466309905052185, "logits/rejected": -0.12651152908802032, "logps/chosen": -1.2661628723144531, "logps/rejected": -1.7019329071044922, "loss": 0.9972, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2661628723144531, "rewards/margins": 0.43577009439468384, "rewards/rejected": -1.7019329071044922, "sft_loss": 1.2796809673309326, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 8.433387175795398, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.2272816002368927, "logits/rejected": -0.11349409818649292, "logps/chosen": -1.3540637493133545, "logps/rejected": -1.7329727411270142, "loss": 1.0411, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3540637493133545, "rewards/margins": 0.3789089322090149, "rewards/rejected": -1.7329727411270142, "sft_loss": 1.3797038793563843, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 9.748980739349928, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.22415952384471893, "logits/rejected": -0.07019458711147308, "logps/chosen": -1.3057830333709717, "logps/rejected": -1.663480520248413, "loss": 1.0315, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3057830333709717, "rewards/margins": 0.35769766569137573, "rewards/rejected": -1.663480520248413, "sft_loss": 1.3861303329467773, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 14.291251597228019, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.23121888935565948, "logits/rejected": -0.008912255987524986, "logps/chosen": -1.4583107233047485, "logps/rejected": -1.7844129800796509, "loss": 1.0807, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4583107233047485, "rewards/margins": 0.32610228657722473, "rewards/rejected": -1.7844129800796509, "sft_loss": 1.458487629890442, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 9.983326050698938, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.23669393360614777, "logits/rejected": -0.13350990414619446, "logps/chosen": -1.3569681644439697, "logps/rejected": -1.793971061706543, "loss": 1.0534, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3569681644439697, "rewards/margins": 0.4370030462741852, "rewards/rejected": -1.793971061706543, "sft_loss": 1.4427287578582764, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 20.06942241202815, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.17605897784233093, "logits/rejected": -0.05872791260480881, "logps/chosen": -1.4228013753890991, "logps/rejected": -1.9332406520843506, "loss": 1.0655, "rewards/accuracies": 0.625, "rewards/chosen": -1.4228013753890991, "rewards/margins": 0.5104393362998962, "rewards/rejected": -1.9332406520843506, "sft_loss": 1.4420769214630127, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 6.966730048920975, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.26047974824905396, "logits/rejected": -0.21100255846977234, "logps/chosen": -1.349595546722412, "logps/rejected": -1.765178918838501, "loss": 1.0365, "rewards/accuracies": 0.625, "rewards/chosen": -1.349595546722412, "rewards/margins": 0.41558337211608887, "rewards/rejected": -1.765178918838501, "sft_loss": 1.3722059726715088, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 8.473349423749175, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.2327476292848587, "logits/rejected": -0.10556666553020477, "logps/chosen": -1.3813903331756592, "logps/rejected": -1.886596441268921, "loss": 1.035, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3813903331756592, "rewards/margins": 0.5052059888839722, "rewards/rejected": -1.886596441268921, "sft_loss": 1.3991137742996216, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 10.035402052308962, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.23867163062095642, "logits/rejected": -0.15210063755512238, "logps/chosen": -1.3119127750396729, "logps/rejected": -1.7596677541732788, "loss": 0.9875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3119127750396729, "rewards/margins": 0.44775494933128357, "rewards/rejected": -1.7596677541732788, "sft_loss": 1.3625072240829468, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 8.330575167502747, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.3020384907722473, "logits/rejected": -0.23150360584259033, "logps/chosen": -1.2680355310440063, "logps/rejected": -1.8124500513076782, "loss": 0.9956, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2680355310440063, "rewards/margins": 0.5444144010543823, "rewards/rejected": -1.8124500513076782, "sft_loss": 1.2984832525253296, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 6.589495318347472, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.26342055201530457, "logits/rejected": -0.03827213495969772, "logps/chosen": -1.3178520202636719, "logps/rejected": -1.7128499746322632, "loss": 1.0207, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3178520202636719, "rewards/margins": 0.3949979543685913, "rewards/rejected": -1.7128499746322632, "sft_loss": 1.363925814628601, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 8.820902723158103, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.35779404640197754, "logits/rejected": -0.17605280876159668, "logps/chosen": -1.3372957706451416, "logps/rejected": -1.8262310028076172, "loss": 1.0258, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3372957706451416, "rewards/margins": 0.48893508315086365, "rewards/rejected": -1.8262310028076172, "sft_loss": 1.4235930442810059, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 10.074634622342717, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.21685686707496643, "logits/rejected": -0.05704827979207039, "logps/chosen": -1.5073444843292236, "logps/rejected": -1.9315385818481445, "loss": 1.0891, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5073444843292236, "rewards/margins": 0.42419394850730896, "rewards/rejected": -1.9315385818481445, "sft_loss": 1.5247254371643066, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 8.722866655496768, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.11254153400659561, "logits/rejected": 0.011176636442542076, "logps/chosen": -1.3440258502960205, "logps/rejected": -1.7505595684051514, "loss": 1.052, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3440258502960205, "rewards/margins": 0.4065338969230652, "rewards/rejected": -1.7505595684051514, "sft_loss": 1.385436773300171, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 6.386705264362839, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.29518577456474304, "logits/rejected": -0.17219959199428558, "logps/chosen": -1.2693736553192139, "logps/rejected": -1.5777580738067627, "loss": 1.0049, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2693736553192139, "rewards/margins": 0.30838438868522644, "rewards/rejected": -1.5777580738067627, "sft_loss": 1.3014901876449585, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 12.161518236812636, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.2759206295013428, "logits/rejected": -0.17716960608959198, "logps/chosen": -1.426809549331665, "logps/rejected": -1.799734354019165, "loss": 1.0697, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.426809549331665, "rewards/margins": 0.3729247450828552, "rewards/rejected": -1.799734354019165, "sft_loss": 1.4560751914978027, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 7.8248066129413365, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.3126507103443146, "logits/rejected": -0.24135151505470276, "logps/chosen": -1.3244651556015015, "logps/rejected": -1.6968772411346436, "loss": 1.0154, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3244651556015015, "rewards/margins": 0.37241214513778687, "rewards/rejected": -1.6968772411346436, "sft_loss": 1.3305847644805908, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 7.08266825370226, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.16913610696792603, "logits/rejected": -0.13287656009197235, "logps/chosen": -1.357891321182251, "logps/rejected": -1.7254955768585205, "loss": 1.031, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.357891321182251, "rewards/margins": 0.36760419607162476, "rewards/rejected": -1.7254955768585205, "sft_loss": 1.3800822496414185, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 7.992211502710734, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.20583942532539368, "logits/rejected": -0.09783850610256195, "logps/chosen": -1.3103139400482178, "logps/rejected": -1.6683019399642944, "loss": 1.0259, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3103139400482178, "rewards/margins": 0.3579878509044647, "rewards/rejected": -1.6683019399642944, "sft_loss": 1.3723644018173218, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 9.146193231875682, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.16054293513298035, "logits/rejected": -0.02962280437350273, "logps/chosen": -1.341709017753601, "logps/rejected": -1.5948436260223389, "loss": 1.0828, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.341709017753601, "rewards/margins": 0.253134548664093, "rewards/rejected": -1.5948436260223389, "sft_loss": 1.4336766004562378, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 7.863417785744701, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.2692742943763733, "logits/rejected": -0.10102590173482895, "logps/chosen": -1.3230074644088745, "logps/rejected": -1.6514161825180054, "loss": 1.0239, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3230074644088745, "rewards/margins": 0.32840877771377563, "rewards/rejected": -1.6514161825180054, "sft_loss": 1.3536341190338135, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 9.644019735561926, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.19013360142707825, "logits/rejected": -0.03948694467544556, "logps/chosen": -1.3543713092803955, "logps/rejected": -1.8687629699707031, "loss": 1.0206, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3543713092803955, "rewards/margins": 0.5143915414810181, "rewards/rejected": -1.8687629699707031, "sft_loss": 1.4165542125701904, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 11.96224025240736, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.1905595362186432, "logits/rejected": -0.060992609709501266, "logps/chosen": -1.377895474433899, "logps/rejected": -1.8040307760238647, "loss": 1.034, "rewards/accuracies": 0.59375, "rewards/chosen": -1.377895474433899, "rewards/margins": 0.42613524198532104, "rewards/rejected": -1.8040307760238647, "sft_loss": 1.4206275939941406, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 13.195637439863178, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.2208903580904007, "logits/rejected": -0.140710711479187, "logps/chosen": -1.3023154735565186, "logps/rejected": -1.6787493228912354, "loss": 1.0144, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3023154735565186, "rewards/margins": 0.3764336109161377, "rewards/rejected": -1.6787493228912354, "sft_loss": 1.3435550928115845, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 11.526309708973256, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.1962347775697708, "logits/rejected": 0.003413937985897064, "logps/chosen": -1.3440295457839966, "logps/rejected": -1.7574107646942139, "loss": 1.0204, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3440295457839966, "rewards/margins": 0.4133811593055725, "rewards/rejected": -1.7574107646942139, "sft_loss": 1.3372437953948975, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 7.910211511126699, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.2631233334541321, "logits/rejected": -0.11768689006567001, "logps/chosen": -1.3593801259994507, "logps/rejected": -1.8382713794708252, "loss": 1.0193, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3593801259994507, "rewards/margins": 0.4788913130760193, "rewards/rejected": -1.8382713794708252, "sft_loss": 1.4099977016448975, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 8.743071655576298, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.16788813471794128, "logits/rejected": 0.0290222205221653, "logps/chosen": -1.3322017192840576, "logps/rejected": -1.7500145435333252, "loss": 1.0442, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3322017192840576, "rewards/margins": 0.4178128242492676, "rewards/rejected": -1.7500145435333252, "sft_loss": 1.411254644393921, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 10.166980185767446, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.17637817561626434, "logits/rejected": -0.1042185053229332, "logps/chosen": -1.307114601135254, "logps/rejected": -1.732693076133728, "loss": 1.0151, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.307114601135254, "rewards/margins": 0.42557865381240845, "rewards/rejected": -1.732693076133728, "sft_loss": 1.3863542079925537, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 12.139404913555175, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.2735113501548767, "logits/rejected": -0.131699800491333, "logps/chosen": -1.265926718711853, "logps/rejected": -1.67597234249115, "loss": 1.0242, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.265926718711853, "rewards/margins": 0.410045862197876, "rewards/rejected": -1.67597234249115, "sft_loss": 1.325702428817749, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 8.370663556151582, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.222178652882576, "logits/rejected": 0.014582176692783833, "logps/chosen": -1.3194665908813477, "logps/rejected": -1.6473604440689087, "loss": 1.0412, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3194665908813477, "rewards/margins": 0.3278936445713043, "rewards/rejected": -1.6473604440689087, "sft_loss": 1.3597601652145386, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 7.684311641797868, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.35185542702674866, "logits/rejected": -0.07574127614498138, "logps/chosen": -1.3232667446136475, "logps/rejected": -1.6644926071166992, "loss": 1.0237, "rewards/accuracies": 0.625, "rewards/chosen": -1.3232667446136475, "rewards/margins": 0.34122568368911743, "rewards/rejected": -1.6644926071166992, "sft_loss": 1.3519316911697388, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 7.797432185476643, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.15763789415359497, "logits/rejected": -0.06480036675930023, "logps/chosen": -1.3108044862747192, "logps/rejected": -1.6585772037506104, "loss": 1.0056, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3108044862747192, "rewards/margins": 0.3477727770805359, "rewards/rejected": -1.6585772037506104, "sft_loss": 1.279354214668274, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 7.745216098418087, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.29884886741638184, "logits/rejected": -0.1565883904695511, "logps/chosen": -1.283048152923584, "logps/rejected": -1.7354202270507812, "loss": 1.002, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.283048152923584, "rewards/margins": 0.45237183570861816, "rewards/rejected": -1.7354202270507812, "sft_loss": 1.3008395433425903, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 10.78515921532401, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.17739878594875336, "logits/rejected": -0.08526907861232758, "logps/chosen": -1.3533596992492676, "logps/rejected": -1.7326189279556274, "loss": 1.0338, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3533596992492676, "rewards/margins": 0.3792593479156494, "rewards/rejected": -1.7326189279556274, "sft_loss": 1.3796982765197754, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 8.176774721936933, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.1857794225215912, "logits/rejected": -0.11552359908819199, "logps/chosen": -1.297877550125122, "logps/rejected": -1.8466908931732178, "loss": 1.0068, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.297877550125122, "rewards/margins": 0.5488134622573853, "rewards/rejected": -1.8466908931732178, "sft_loss": 1.3502676486968994, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 9.824605979924113, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.2430846244096756, "logits/rejected": -0.09735213220119476, "logps/chosen": -1.3140398263931274, "logps/rejected": -1.7592847347259521, "loss": 0.9866, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3140398263931274, "rewards/margins": 0.44524508714675903, "rewards/rejected": -1.7592847347259521, "sft_loss": 1.3423892259597778, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 6.999487909190867, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.11612536013126373, "logits/rejected": 0.008373789489269257, "logps/chosen": -1.3217031955718994, "logps/rejected": -1.7657520771026611, "loss": 1.0348, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3217031955718994, "rewards/margins": 0.44404906034469604, "rewards/rejected": -1.7657520771026611, "sft_loss": 1.343665361404419, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 6.980014810377578, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.21279378235340118, "logits/rejected": -0.15889443457126617, "logps/chosen": -1.3280820846557617, "logps/rejected": -1.8464562892913818, "loss": 0.9733, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3280820846557617, "rewards/margins": 0.5183740854263306, "rewards/rejected": -1.8464562892913818, "sft_loss": 1.3642082214355469, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 6.949960470988114, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.10336363315582275, "logits/rejected": -0.05016554147005081, "logps/chosen": -1.4357208013534546, "logps/rejected": -1.8436673879623413, "loss": 1.0583, "rewards/accuracies": 0.625, "rewards/chosen": -1.4357208013534546, "rewards/margins": 0.4079464077949524, "rewards/rejected": -1.8436673879623413, "sft_loss": 1.4935412406921387, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 9.413612625444546, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.15116353332996368, "logits/rejected": -0.07496649026870728, "logps/chosen": -1.3940389156341553, "logps/rejected": -1.6964954137802124, "loss": 1.0913, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3940389156341553, "rewards/margins": 0.30245649814605713, "rewards/rejected": -1.6964954137802124, "sft_loss": 1.4201014041900635, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 7.514835548491341, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.29490602016448975, "logits/rejected": -0.08670699596405029, "logps/chosen": -1.3553574085235596, "logps/rejected": -1.8344428539276123, "loss": 0.9764, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3553574085235596, "rewards/margins": 0.4790855348110199, "rewards/rejected": -1.8344428539276123, "sft_loss": 1.3091545104980469, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 9.723844697548177, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.13972511887550354, "logits/rejected": 0.04370427876710892, "logps/chosen": -1.3503167629241943, "logps/rejected": -1.9329227209091187, "loss": 1.0023, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3503167629241943, "rewards/margins": 0.5826060175895691, "rewards/rejected": -1.9329227209091187, "sft_loss": 1.373500108718872, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.13220730423927307, "eval_logits/rejected": 0.22114311158657074, "eval_logps/chosen": -1.376747727394104, "eval_logps/rejected": -1.7830473184585571, "eval_loss": 1.0404950380325317, "eval_rewards/accuracies": 0.612017810344696, "eval_rewards/chosen": -1.376747727394104, "eval_rewards/margins": 0.4062995910644531, "eval_rewards/rejected": -1.7830473184585571, "eval_runtime": 43.3596, "eval_samples_per_second": 31.02, "eval_sft_loss": 1.3926547765731812, "eval_steps_per_second": 7.772, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 8.133124681161464, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.25169333815574646, "logits/rejected": -0.20886509120464325, "logps/chosen": -1.3518650531768799, "logps/rejected": -1.6274452209472656, "loss": 1.0831, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3518650531768799, "rewards/margins": 0.2755802273750305, "rewards/rejected": -1.6274452209472656, "sft_loss": 1.380122184753418, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 7.469153856364965, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.17526309192180634, "logits/rejected": -0.051106780767440796, "logps/chosen": -1.313644289970398, "logps/rejected": -1.7661473751068115, "loss": 1.0174, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.313644289970398, "rewards/margins": 0.4525030255317688, "rewards/rejected": -1.7661473751068115, "sft_loss": 1.39413583278656, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 6.523078696838851, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.10688170045614243, "logits/rejected": 0.013090262189507484, "logps/chosen": -1.3751380443572998, "logps/rejected": -1.7437824010849, "loss": 1.046, "rewards/accuracies": 0.625, "rewards/chosen": -1.3751380443572998, "rewards/margins": 0.36864447593688965, "rewards/rejected": -1.7437824010849, "sft_loss": 1.3935257196426392, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 12.122483026445424, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.1743597835302353, "logits/rejected": -0.10058524459600449, "logps/chosen": -1.3639273643493652, "logps/rejected": -1.6401739120483398, "loss": 1.0758, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3639273643493652, "rewards/margins": 0.276246577501297, "rewards/rejected": -1.6401739120483398, "sft_loss": 1.4094431400299072, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 8.402271918311031, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.12908560037612915, "logits/rejected": -0.10964863002300262, "logps/chosen": -1.3172557353973389, "logps/rejected": -1.6691465377807617, "loss": 1.0411, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3172557353973389, "rewards/margins": 0.3518907427787781, "rewards/rejected": -1.6691465377807617, "sft_loss": 1.3863766193389893, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 15.309824266627936, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.22412028908729553, "logits/rejected": -0.1645040512084961, "logps/chosen": -1.3662104606628418, "logps/rejected": -1.7294059991836548, "loss": 1.0994, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3662104606628418, "rewards/margins": 0.3631953299045563, "rewards/rejected": -1.7294059991836548, "sft_loss": 1.470768690109253, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 7.523094119971832, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.23497335612773895, "logits/rejected": -0.14777755737304688, "logps/chosen": -1.3361382484436035, "logps/rejected": -1.7206249237060547, "loss": 1.0388, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3361382484436035, "rewards/margins": 0.3844866156578064, "rewards/rejected": -1.7206249237060547, "sft_loss": 1.4018043279647827, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 6.650681430076024, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.1847231686115265, "logits/rejected": -0.0686403438448906, "logps/chosen": -1.4241373538970947, "logps/rejected": -1.6333825588226318, "loss": 1.0985, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4241373538970947, "rewards/margins": 0.20924527943134308, "rewards/rejected": -1.6333825588226318, "sft_loss": 1.4570655822753906, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 6.703996052459396, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.24172177910804749, "logits/rejected": -0.11361400038003922, "logps/chosen": -1.4382174015045166, "logps/rejected": -1.6817821264266968, "loss": 1.0878, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4382174015045166, "rewards/margins": 0.24356460571289062, "rewards/rejected": -1.6817821264266968, "sft_loss": 1.47786545753479, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 6.809062902834572, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.2671525776386261, "logits/rejected": -0.16197296977043152, "logps/chosen": -1.316367268562317, "logps/rejected": -1.6348955631256104, "loss": 1.035, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.316367268562317, "rewards/margins": 0.31852835416793823, "rewards/rejected": -1.6348955631256104, "sft_loss": 1.3881739377975464, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 8.59906216379134, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.2066754400730133, "logits/rejected": -0.06717734038829803, "logps/chosen": -1.4142173528671265, "logps/rejected": -1.7292635440826416, "loss": 1.0753, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4142173528671265, "rewards/margins": 0.31504613161087036, "rewards/rejected": -1.7292635440826416, "sft_loss": 1.4033355712890625, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 9.493755359677273, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.23579907417297363, "logits/rejected": -0.10296180099248886, "logps/chosen": -1.2681344747543335, "logps/rejected": -1.6588176488876343, "loss": 1.0213, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2681344747543335, "rewards/margins": 0.39068326354026794, "rewards/rejected": -1.6588176488876343, "sft_loss": 1.3464233875274658, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 7.389145463944783, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.17442061007022858, "logits/rejected": -0.05897391960024834, "logps/chosen": -1.272640585899353, "logps/rejected": -1.662667989730835, "loss": 1.0202, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.272640585899353, "rewards/margins": 0.39002761244773865, "rewards/rejected": -1.662667989730835, "sft_loss": 1.2678601741790771, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 6.70872777057386, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.23170146346092224, "logits/rejected": -0.08080779761075974, "logps/chosen": -1.3061374425888062, "logps/rejected": -1.7743968963623047, "loss": 1.0124, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3061374425888062, "rewards/margins": 0.46825942397117615, "rewards/rejected": -1.7743968963623047, "sft_loss": 1.4011518955230713, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 5.941168181963396, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.21781177818775177, "logits/rejected": -0.07623197138309479, "logps/chosen": -1.3767824172973633, "logps/rejected": -1.8881441354751587, "loss": 1.0171, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3767824172973633, "rewards/margins": 0.5113617181777954, "rewards/rejected": -1.8881441354751587, "sft_loss": 1.4345794916152954, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 7.172600001585544, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.21801035106182098, "logits/rejected": -0.094429150223732, "logps/chosen": -1.2056726217269897, "logps/rejected": -1.6035264730453491, "loss": 0.9565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2056726217269897, "rewards/margins": 0.39785367250442505, "rewards/rejected": -1.6035264730453491, "sft_loss": 1.2749571800231934, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 8.996171174673789, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.227299302816391, "logits/rejected": -0.14037606120109558, "logps/chosen": -1.2919366359710693, "logps/rejected": -1.7213513851165771, "loss": 1.0273, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2919366359710693, "rewards/margins": 0.4294148087501526, "rewards/rejected": -1.7213513851165771, "sft_loss": 1.366469383239746, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 6.584738685805881, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.2102833241224289, "logits/rejected": -0.0533757284283638, "logps/chosen": -1.3438446521759033, "logps/rejected": -1.7797075510025024, "loss": 1.018, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3438446521759033, "rewards/margins": 0.43586283922195435, "rewards/rejected": -1.7797075510025024, "sft_loss": 1.3491125106811523, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 8.115150453314252, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.1443311721086502, "logits/rejected": -0.09654446691274643, "logps/chosen": -1.3931901454925537, "logps/rejected": -1.8128507137298584, "loss": 1.0565, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3931901454925537, "rewards/margins": 0.4196605086326599, "rewards/rejected": -1.8128507137298584, "sft_loss": 1.4648798704147339, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 8.432376252624964, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.14914622902870178, "logits/rejected": -0.04011151194572449, "logps/chosen": -1.3027012348175049, "logps/rejected": -1.7023481130599976, "loss": 1.0173, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3027012348175049, "rewards/margins": 0.3996468484401703, "rewards/rejected": -1.7023481130599976, "sft_loss": 1.332310438156128, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 7.581891370204553, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.2142951786518097, "logits/rejected": -0.1357312798500061, "logps/chosen": -1.3414753675460815, "logps/rejected": -1.75808846950531, "loss": 1.0063, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3414753675460815, "rewards/margins": 0.4166131019592285, "rewards/rejected": -1.75808846950531, "sft_loss": 1.344266414642334, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 8.87844232756679, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.06727816164493561, "logits/rejected": -0.01013193279504776, "logps/chosen": -1.3536124229431152, "logps/rejected": -1.688381552696228, "loss": 1.0899, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3536124229431152, "rewards/margins": 0.3347693085670471, "rewards/rejected": -1.688381552696228, "sft_loss": 1.4273655414581299, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 6.813440382150874, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.1517767459154129, "logits/rejected": -0.07338926941156387, "logps/chosen": -1.3509794473648071, "logps/rejected": -1.7539699077606201, "loss": 1.0538, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3509794473648071, "rewards/margins": 0.4029907286167145, "rewards/rejected": -1.7539699077606201, "sft_loss": 1.3848661184310913, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 4.664992051222814, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.19024668633937836, "logits/rejected": -0.10702836513519287, "logps/chosen": -1.3410247564315796, "logps/rejected": -1.6328284740447998, "loss": 1.0458, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3410247564315796, "rewards/margins": 0.2918040156364441, "rewards/rejected": -1.6328284740447998, "sft_loss": 1.3327479362487793, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 7.138454236405234, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.24577781558036804, "logits/rejected": -0.13845011591911316, "logps/chosen": -1.3480573892593384, "logps/rejected": -1.739193320274353, "loss": 1.0602, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3480573892593384, "rewards/margins": 0.3911357820034027, "rewards/rejected": -1.739193320274353, "sft_loss": 1.4003136157989502, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 5.283253532535059, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.1491527259349823, "logits/rejected": -0.01746211014688015, "logps/chosen": -1.2778774499893188, "logps/rejected": -1.7071552276611328, "loss": 0.9776, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2778774499893188, "rewards/margins": 0.42927759885787964, "rewards/rejected": -1.7071552276611328, "sft_loss": 1.2987234592437744, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 12.054790115213882, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.11329762637615204, "logits/rejected": -0.03464198112487793, "logps/chosen": -1.4374887943267822, "logps/rejected": -1.6887142658233643, "loss": 1.1489, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4374887943267822, "rewards/margins": 0.2512255609035492, "rewards/rejected": -1.6887142658233643, "sft_loss": 1.4471741914749146, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 7.143914332553867, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.12631377577781677, "logits/rejected": -0.07701162248849869, "logps/chosen": -1.3984800577163696, "logps/rejected": -1.6868093013763428, "loss": 1.0818, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3984800577163696, "rewards/margins": 0.28832948207855225, "rewards/rejected": -1.6868093013763428, "sft_loss": 1.379875659942627, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 7.623520104046133, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.17279155552387238, "logits/rejected": -0.06797249615192413, "logps/chosen": -1.3755438327789307, "logps/rejected": -1.7344719171524048, "loss": 1.0419, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3755438327789307, "rewards/margins": 0.3589281439781189, "rewards/rejected": -1.7344719171524048, "sft_loss": 1.3938535451889038, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 11.226955224513722, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.23111264407634735, "logits/rejected": -0.14523088932037354, "logps/chosen": -1.250886082649231, "logps/rejected": -1.725956678390503, "loss": 0.9775, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.250886082649231, "rewards/margins": 0.47507089376449585, "rewards/rejected": -1.725956678390503, "sft_loss": 1.287925124168396, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 6.836683127732292, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.19628411531448364, "logits/rejected": -0.09107625484466553, "logps/chosen": -1.2780523300170898, "logps/rejected": -1.8557761907577515, "loss": 0.9756, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2780523300170898, "rewards/margins": 0.5777239203453064, "rewards/rejected": -1.8557761907577515, "sft_loss": 1.3645378351211548, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 6.996028018688775, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.2888035476207733, "logits/rejected": -0.2642049789428711, "logps/chosen": -1.3084895610809326, "logps/rejected": -1.8495372533798218, "loss": 1.0097, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3084895610809326, "rewards/margins": 0.5410477519035339, "rewards/rejected": -1.8495372533798218, "sft_loss": 1.3794362545013428, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 9.926606596207181, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.1349642425775528, "logits/rejected": 0.04218194633722305, "logps/chosen": -1.3362480401992798, "logps/rejected": -1.839485764503479, "loss": 0.999, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3362480401992798, "rewards/margins": 0.503237783908844, "rewards/rejected": -1.839485764503479, "sft_loss": 1.3468215465545654, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 14.972325401010377, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.2785857617855072, "logits/rejected": -0.11333181709051132, "logps/chosen": -1.274951696395874, "logps/rejected": -1.9533205032348633, "loss": 0.9451, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.274951696395874, "rewards/margins": 0.6783686876296997, "rewards/rejected": -1.9533205032348633, "sft_loss": 1.319342851638794, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 8.705233648780364, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.23137220740318298, "logits/rejected": -0.17029942572116852, "logps/chosen": -1.3322160243988037, "logps/rejected": -1.7487624883651733, "loss": 1.025, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3322160243988037, "rewards/margins": 0.4165467321872711, "rewards/rejected": -1.7487624883651733, "sft_loss": 1.3671302795410156, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 7.650898256780712, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.160761758685112, "logits/rejected": -0.12005305290222168, "logps/chosen": -1.323396921157837, "logps/rejected": -1.7633053064346313, "loss": 1.0471, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.323396921157837, "rewards/margins": 0.43990859389305115, "rewards/rejected": -1.7633053064346313, "sft_loss": 1.4031734466552734, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 6.0976862564963, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.21818462014198303, "logits/rejected": -0.09445396065711975, "logps/chosen": -1.3529255390167236, "logps/rejected": -1.9128081798553467, "loss": 0.9711, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3529255390167236, "rewards/margins": 0.559882640838623, "rewards/rejected": -1.9128081798553467, "sft_loss": 1.3481228351593018, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 5.441754368319895, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.2935495674610138, "logits/rejected": -0.11881496757268906, "logps/chosen": -1.305868148803711, "logps/rejected": -1.8669077157974243, "loss": 0.9641, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.305868148803711, "rewards/margins": 0.5610396862030029, "rewards/rejected": -1.8669077157974243, "sft_loss": 1.3788195848464966, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 9.345893259790794, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.18723037838935852, "logits/rejected": -0.15769711136817932, "logps/chosen": -1.2819069623947144, "logps/rejected": -1.6533581018447876, "loss": 1.0239, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2819069623947144, "rewards/margins": 0.37145131826400757, "rewards/rejected": -1.6533581018447876, "sft_loss": 1.3619247674942017, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 11.061799198438894, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.1916731595993042, "logits/rejected": -0.059759993106126785, "logps/chosen": -1.3480558395385742, "logps/rejected": -1.8577535152435303, "loss": 1.0046, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3480558395385742, "rewards/margins": 0.509697675704956, "rewards/rejected": -1.8577535152435303, "sft_loss": 1.427194356918335, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 9.434706841711222, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.1767081767320633, "logits/rejected": -0.059535883367061615, "logps/chosen": -1.2606967687606812, "logps/rejected": -1.712166428565979, "loss": 0.9757, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2606967687606812, "rewards/margins": 0.4514695107936859, "rewards/rejected": -1.712166428565979, "sft_loss": 1.308203101158142, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 8.219395893248242, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.07792104035615921, "logits/rejected": 0.010812275111675262, "logps/chosen": -1.2784520387649536, "logps/rejected": -1.8368396759033203, "loss": 0.9606, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2784520387649536, "rewards/margins": 0.558387815952301, "rewards/rejected": -1.8368396759033203, "sft_loss": 1.296967625617981, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 7.428218502025483, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.30743715167045593, "logits/rejected": -0.1746453046798706, "logps/chosen": -1.3353490829467773, "logps/rejected": -1.719951868057251, "loss": 1.0081, "rewards/accuracies": 0.625, "rewards/chosen": -1.3353490829467773, "rewards/margins": 0.3846026659011841, "rewards/rejected": -1.719951868057251, "sft_loss": 1.3209892511367798, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 10.29560194460434, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.281122624874115, "logits/rejected": -0.08383277803659439, "logps/chosen": -1.399206519126892, "logps/rejected": -1.8089908361434937, "loss": 1.0588, "rewards/accuracies": 0.59375, "rewards/chosen": -1.399206519126892, "rewards/margins": 0.40978437662124634, "rewards/rejected": -1.8089908361434937, "sft_loss": 1.4134975671768188, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 7.948370711282376, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.23702120780944824, "logits/rejected": -0.12606510519981384, "logps/chosen": -1.2770016193389893, "logps/rejected": -1.7505557537078857, "loss": 0.971, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2770016193389893, "rewards/margins": 0.47355398535728455, "rewards/rejected": -1.7505557537078857, "sft_loss": 1.3434475660324097, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 7.104542934372534, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.31596413254737854, "logits/rejected": -0.1691267192363739, "logps/chosen": -1.3756964206695557, "logps/rejected": -1.9007256031036377, "loss": 0.9993, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3756964206695557, "rewards/margins": 0.5250293016433716, "rewards/rejected": -1.9007256031036377, "sft_loss": 1.389954924583435, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 9.283822610907997, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.189658060669899, "logits/rejected": -0.0372190997004509, "logps/chosen": -1.3490064144134521, "logps/rejected": -1.820847511291504, "loss": 0.9957, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3490064144134521, "rewards/margins": 0.47184085845947266, "rewards/rejected": -1.820847511291504, "sft_loss": 1.3788495063781738, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 7.308347252793988, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.16798244416713715, "logits/rejected": -0.0625995546579361, "logps/chosen": -1.2780601978302002, "logps/rejected": -1.8349239826202393, "loss": 0.9607, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2780601978302002, "rewards/margins": 0.5568638443946838, "rewards/rejected": -1.8349239826202393, "sft_loss": 1.3138271570205688, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 5.961133659424594, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.3501536250114441, "logits/rejected": -0.10656338930130005, "logps/chosen": -1.3864253759384155, "logps/rejected": -1.8850123882293701, "loss": 0.9978, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3864253759384155, "rewards/margins": 0.4985869824886322, "rewards/rejected": -1.8850123882293701, "sft_loss": 1.4140044450759888, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 8.40981218131488, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.24091164767742157, "logits/rejected": -0.07308875769376755, "logps/chosen": -1.3868906497955322, "logps/rejected": -1.8397271633148193, "loss": 1.0406, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3868906497955322, "rewards/margins": 0.4528365731239319, "rewards/rejected": -1.8397271633148193, "sft_loss": 1.4096046686172485, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 12.477651068225477, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.17689739167690277, "logits/rejected": 0.006243367679417133, "logps/chosen": -1.3302204608917236, "logps/rejected": -2.0581300258636475, "loss": 0.932, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3302204608917236, "rewards/margins": 0.7279095649719238, "rewards/rejected": -2.0581300258636475, "sft_loss": 1.333121657371521, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 6.455403879486427, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.20638510584831238, "logits/rejected": -0.042838867753744125, "logps/chosen": -1.3307113647460938, "logps/rejected": -1.9355605840682983, "loss": 0.9769, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3307113647460938, "rewards/margins": 0.6048492193222046, "rewards/rejected": -1.9355605840682983, "sft_loss": 1.3224481344223022, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 9.636876818748828, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.32187479734420776, "logits/rejected": -0.1283341944217682, "logps/chosen": -1.4119688272476196, "logps/rejected": -1.7951968908309937, "loss": 1.0789, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4119688272476196, "rewards/margins": 0.38322800397872925, "rewards/rejected": -1.7951968908309937, "sft_loss": 1.4519226551055908, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 10.56360714977184, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.2144956886768341, "logits/rejected": -0.1632927805185318, "logps/chosen": -1.2403037548065186, "logps/rejected": -1.7169328927993774, "loss": 0.9702, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2403037548065186, "rewards/margins": 0.4766291677951813, "rewards/rejected": -1.7169328927993774, "sft_loss": 1.270911693572998, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 9.584980104505666, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.2386658638715744, "logits/rejected": -0.12415851652622223, "logps/chosen": -1.340667963027954, "logps/rejected": -1.8312772512435913, "loss": 0.979, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.340667963027954, "rewards/margins": 0.49060922861099243, "rewards/rejected": -1.8312772512435913, "sft_loss": 1.4031498432159424, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 13.524396726620859, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.23326320946216583, "logits/rejected": -0.06280764937400818, "logps/chosen": -1.221289873123169, "logps/rejected": -1.7749824523925781, "loss": 0.9317, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.221289873123169, "rewards/margins": 0.5536924600601196, "rewards/rejected": -1.7749824523925781, "sft_loss": 1.2569924592971802, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 9.48373908089574, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.2690264582633972, "logits/rejected": -0.11539246886968613, "logps/chosen": -1.268319845199585, "logps/rejected": -1.8795678615570068, "loss": 0.9535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.268319845199585, "rewards/margins": 0.6112480163574219, "rewards/rejected": -1.8795678615570068, "sft_loss": 1.3194853067398071, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 7.272403165121372, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.16457560658454895, "logits/rejected": -0.06957674771547318, "logps/chosen": -1.3222824335098267, "logps/rejected": -1.7769525051116943, "loss": 1.0518, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3222824335098267, "rewards/margins": 0.4546701908111572, "rewards/rejected": -1.7769525051116943, "sft_loss": 1.375428557395935, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 6.596628800990976, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.24087989330291748, "logits/rejected": -0.0020778581965714693, "logps/chosen": -1.288952112197876, "logps/rejected": -1.8099241256713867, "loss": 0.9846, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.288952112197876, "rewards/margins": 0.5209718942642212, "rewards/rejected": -1.8099241256713867, "sft_loss": 1.3294023275375366, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 12.309650576815347, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.06875167042016983, "logits/rejected": -0.052970387041568756, "logps/chosen": -1.3002475500106812, "logps/rejected": -1.7216488122940063, "loss": 1.0046, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3002475500106812, "rewards/margins": 0.42140135169029236, "rewards/rejected": -1.7216488122940063, "sft_loss": 1.35751473903656, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 7.051202750690179, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.15310558676719666, "logits/rejected": -0.05742264911532402, "logps/chosen": -1.3195465803146362, "logps/rejected": -1.7413108348846436, "loss": 1.0062, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3195465803146362, "rewards/margins": 0.4217642843723297, "rewards/rejected": -1.7413108348846436, "sft_loss": 1.3092045783996582, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 8.940324337005347, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.1288268119096756, "logits/rejected": -0.006907902657985687, "logps/chosen": -1.2185369729995728, "logps/rejected": -1.6721731424331665, "loss": 0.9723, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2185369729995728, "rewards/margins": 0.4536362290382385, "rewards/rejected": -1.6721731424331665, "sft_loss": 1.2611117362976074, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 7.482529780113386, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.22753730416297913, "logits/rejected": -0.06723004579544067, "logps/chosen": -1.3481438159942627, "logps/rejected": -1.8852345943450928, "loss": 0.9939, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3481438159942627, "rewards/margins": 0.5370906591415405, "rewards/rejected": -1.8852345943450928, "sft_loss": 1.39449143409729, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 13.492128161318169, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.21619009971618652, "logits/rejected": -0.05382479354739189, "logps/chosen": -1.3529709577560425, "logps/rejected": -2.009631633758545, "loss": 0.9675, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3529709577560425, "rewards/margins": 0.6566608548164368, "rewards/rejected": -2.009631633758545, "sft_loss": 1.3439533710479736, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 6.67567810727143, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.20344305038452148, "logits/rejected": -0.08249272406101227, "logps/chosen": -1.27012300491333, "logps/rejected": -1.800536870956421, "loss": 0.9979, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.27012300491333, "rewards/margins": 0.5304139852523804, "rewards/rejected": -1.800536870956421, "sft_loss": 1.3256646394729614, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 5.538112523667141, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.28186747431755066, "logits/rejected": -0.1351318061351776, "logps/chosen": -1.3375747203826904, "logps/rejected": -1.8085401058197021, "loss": 0.9924, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3375747203826904, "rewards/margins": 0.4709654748439789, "rewards/rejected": -1.8085401058197021, "sft_loss": 1.341721773147583, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 6.646817354380201, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.1404200941324234, "logits/rejected": -0.07894755899906158, "logps/chosen": -1.3222754001617432, "logps/rejected": -1.8376655578613281, "loss": 0.9938, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3222754001617432, "rewards/margins": 0.5153903961181641, "rewards/rejected": -1.8376655578613281, "sft_loss": 1.3083678483963013, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 10.23679057616976, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.17988917231559753, "logits/rejected": -0.12885865569114685, "logps/chosen": -1.2356412410736084, "logps/rejected": -1.8562723398208618, "loss": 0.9153, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2356412410736084, "rewards/margins": 0.6206308603286743, "rewards/rejected": -1.8562723398208618, "sft_loss": 1.2337496280670166, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 6.8928666859689045, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.19816677272319794, "logits/rejected": -0.0217067189514637, "logps/chosen": -1.3648486137390137, "logps/rejected": -1.738193154335022, "loss": 1.0507, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3648486137390137, "rewards/margins": 0.3733447790145874, "rewards/rejected": -1.738193154335022, "sft_loss": 1.3976587057113647, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 9.006350243530518, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.11615003645420074, "logits/rejected": 0.044729799032211304, "logps/chosen": -1.3059442043304443, "logps/rejected": -1.7530139684677124, "loss": 1.0125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3059442043304443, "rewards/margins": 0.4470697343349457, "rewards/rejected": -1.7530139684677124, "sft_loss": 1.3343216180801392, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 6.81240384354102, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.2713935077190399, "logits/rejected": -0.18466126918792725, "logps/chosen": -1.2866567373275757, "logps/rejected": -1.8285629749298096, "loss": 0.994, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2866567373275757, "rewards/margins": 0.5419060587882996, "rewards/rejected": -1.8285629749298096, "sft_loss": 1.3387314081192017, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 12.535037949453782, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.16105331480503082, "logits/rejected": -0.04563795030117035, "logps/chosen": -1.3320871591567993, "logps/rejected": -2.02455472946167, "loss": 0.9829, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3320871591567993, "rewards/margins": 0.6924676895141602, "rewards/rejected": -2.02455472946167, "sft_loss": 1.413009762763977, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 6.675328687431673, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.11590099334716797, "logits/rejected": -0.0057983072474598885, "logps/chosen": -1.2462270259857178, "logps/rejected": -1.8627185821533203, "loss": 0.9516, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2462270259857178, "rewards/margins": 0.6164913773536682, "rewards/rejected": -1.8627185821533203, "sft_loss": 1.319265604019165, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 6.603486037645406, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.1910642683506012, "logits/rejected": -0.09453106671571732, "logps/chosen": -1.2924778461456299, "logps/rejected": -1.8789180517196655, "loss": 0.9891, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2924778461456299, "rewards/margins": 0.5864402055740356, "rewards/rejected": -1.8789180517196655, "sft_loss": 1.3684488534927368, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 7.756837511314091, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.13991785049438477, "logits/rejected": -0.03918793797492981, "logps/chosen": -1.211921215057373, "logps/rejected": -1.6410366296768188, "loss": 0.9586, "rewards/accuracies": 0.65625, "rewards/chosen": -1.211921215057373, "rewards/margins": 0.4291153848171234, "rewards/rejected": -1.6410366296768188, "sft_loss": 1.2346084117889404, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 7.276228405268265, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.2536202073097229, "logits/rejected": -0.1750868409872055, "logps/chosen": -1.2155635356903076, "logps/rejected": -1.861124038696289, "loss": 0.9508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2155635356903076, "rewards/margins": 0.6455605626106262, "rewards/rejected": -1.861124038696289, "sft_loss": 1.329552412033081, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 14.883539668534967, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.2990095019340515, "logits/rejected": -0.19773916900157928, "logps/chosen": -1.3071012496948242, "logps/rejected": -1.9256079196929932, "loss": 0.944, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3071012496948242, "rewards/margins": 0.6185065507888794, "rewards/rejected": -1.9256079196929932, "sft_loss": 1.3233211040496826, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 9.44597333332424, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.29291418194770813, "logits/rejected": -0.15362228453159332, "logps/chosen": -1.2437551021575928, "logps/rejected": -1.8007538318634033, "loss": 0.9541, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2437551021575928, "rewards/margins": 0.556998610496521, "rewards/rejected": -1.8007538318634033, "sft_loss": 1.307832956314087, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 16.441148444934594, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.3222687542438507, "logits/rejected": -0.061028677970170975, "logps/chosen": -1.3114013671875, "logps/rejected": -1.8884906768798828, "loss": 0.9779, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3114013671875, "rewards/margins": 0.5770891308784485, "rewards/rejected": -1.8884906768798828, "sft_loss": 1.3354787826538086, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 16.965841060039175, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.20985686779022217, "logits/rejected": -0.10149633884429932, "logps/chosen": -1.2924827337265015, "logps/rejected": -1.9266865253448486, "loss": 0.9654, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2924827337265015, "rewards/margins": 0.6342039108276367, "rewards/rejected": -1.9266865253448486, "sft_loss": 1.3712036609649658, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.1482262760400772, "eval_logits/rejected": 0.240483820438385, "eval_logps/chosen": -1.3910187482833862, "eval_logps/rejected": -1.8183025121688843, "eval_loss": 1.0418007373809814, "eval_rewards/accuracies": 0.6179525256156921, "eval_rewards/chosen": -1.3910187482833862, "eval_rewards/margins": 0.4272836744785309, "eval_rewards/rejected": -1.8183025121688843, "eval_runtime": 46.2356, "eval_samples_per_second": 29.09, "eval_sft_loss": 1.3967313766479492, "eval_steps_per_second": 7.289, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 9.133500661368627, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.19903546571731567, "logits/rejected": 0.02608914114534855, "logps/chosen": -1.4473344087600708, "logps/rejected": -1.8965390920639038, "loss": 1.0441, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4473344087600708, "rewards/margins": 0.44920483231544495, "rewards/rejected": -1.8965390920639038, "sft_loss": 1.4248669147491455, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 9.074100048107182, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.23106679320335388, "logits/rejected": -0.13114657998085022, "logps/chosen": -1.334062933921814, "logps/rejected": -1.7841312885284424, "loss": 1.0196, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.334062933921814, "rewards/margins": 0.45006832480430603, "rewards/rejected": -1.7841312885284424, "sft_loss": 1.3713685274124146, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 9.618864174062091, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.21874377131462097, "logits/rejected": 0.03923650458455086, "logps/chosen": -1.3294260501861572, "logps/rejected": -1.780279517173767, "loss": 1.026, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3294260501861572, "rewards/margins": 0.45085349678993225, "rewards/rejected": -1.780279517173767, "sft_loss": 1.3640459775924683, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 13.560338534464554, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.29200059175491333, "logits/rejected": -0.1397140622138977, "logps/chosen": -1.364084243774414, "logps/rejected": -1.825477957725525, "loss": 1.0258, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.364084243774414, "rewards/margins": 0.4613935351371765, "rewards/rejected": -1.825477957725525, "sft_loss": 1.4269052743911743, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 7.098165162790431, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.21481871604919434, "logits/rejected": -0.10389243066310883, "logps/chosen": -1.286023736000061, "logps/rejected": -1.780125617980957, "loss": 0.992, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.286023736000061, "rewards/margins": 0.49410194158554077, "rewards/rejected": -1.780125617980957, "sft_loss": 1.304584264755249, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 11.026115794573862, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.25596949458122253, "logits/rejected": -0.17235644161701202, "logps/chosen": -1.2982758283615112, "logps/rejected": -1.8246577978134155, "loss": 0.9829, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2982758283615112, "rewards/margins": 0.5263819694519043, "rewards/rejected": -1.8246577978134155, "sft_loss": 1.3357901573181152, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 9.030464811943805, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.2392152100801468, "logits/rejected": -0.20006027817726135, "logps/chosen": -1.260817289352417, "logps/rejected": -1.5614498853683472, "loss": 1.0272, "rewards/accuracies": 0.65625, "rewards/chosen": -1.260817289352417, "rewards/margins": 0.30063262581825256, "rewards/rejected": -1.5614498853683472, "sft_loss": 1.3343435525894165, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 5.569795891024788, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.22639043629169464, "logits/rejected": -0.18254801630973816, "logps/chosen": -1.242897391319275, "logps/rejected": -1.709593415260315, "loss": 0.9835, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.242897391319275, "rewards/margins": 0.46669578552246094, "rewards/rejected": -1.709593415260315, "sft_loss": 1.2660150527954102, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 6.791702368644484, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.18503087759017944, "logits/rejected": -0.05424821376800537, "logps/chosen": -1.358148455619812, "logps/rejected": -1.8204864263534546, "loss": 1.0243, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.358148455619812, "rewards/margins": 0.4623379111289978, "rewards/rejected": -1.8204864263534546, "sft_loss": 1.4118998050689697, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 8.644936297670235, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.26354607939720154, "logits/rejected": -0.15164721012115479, "logps/chosen": -1.3595654964447021, "logps/rejected": -1.7514352798461914, "loss": 1.0539, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3595654964447021, "rewards/margins": 0.39186978340148926, "rewards/rejected": -1.7514352798461914, "sft_loss": 1.4033119678497314, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 6.899153154126547, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.1441766768693924, "logits/rejected": -0.08697351068258286, "logps/chosen": -1.338050365447998, "logps/rejected": -1.6296417713165283, "loss": 1.0226, "rewards/accuracies": 0.65625, "rewards/chosen": -1.338050365447998, "rewards/margins": 0.2915913760662079, "rewards/rejected": -1.6296417713165283, "sft_loss": 1.3598458766937256, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 21.304530755120677, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.06472693383693695, "logits/rejected": 0.04358845204114914, "logps/chosen": -1.2369481325149536, "logps/rejected": -1.7288429737091064, "loss": 0.9767, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2369481325149536, "rewards/margins": 0.4918946623802185, "rewards/rejected": -1.7288429737091064, "sft_loss": 1.2975510358810425, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 16.284974043783276, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.2730258107185364, "logits/rejected": -0.1723470240831375, "logps/chosen": -1.3269093036651611, "logps/rejected": -1.891418695449829, "loss": 0.9966, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3269093036651611, "rewards/margins": 0.5645094513893127, "rewards/rejected": -1.891418695449829, "sft_loss": 1.3565967082977295, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 11.971155874073554, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.16100676357746124, "logits/rejected": 0.049106527119874954, "logps/chosen": -1.2725163698196411, "logps/rejected": -1.8967195749282837, "loss": 0.9728, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2725163698196411, "rewards/margins": 0.624203085899353, "rewards/rejected": -1.8967195749282837, "sft_loss": 1.3371376991271973, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 12.42484802411317, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.23265857994556427, "logits/rejected": -0.04037471115589142, "logps/chosen": -1.378442406654358, "logps/rejected": -1.7594501972198486, "loss": 1.0376, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.378442406654358, "rewards/margins": 0.38100793957710266, "rewards/rejected": -1.7594501972198486, "sft_loss": 1.4049503803253174, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 9.926837801001243, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.24227485060691833, "logits/rejected": -0.10362211614847183, "logps/chosen": -1.3642479181289673, "logps/rejected": -1.9338639974594116, "loss": 1.016, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3642479181289673, "rewards/margins": 0.5696161389350891, "rewards/rejected": -1.9338639974594116, "sft_loss": 1.4558117389678955, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 10.447830711803674, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.20619995892047882, "logits/rejected": -0.11009863764047623, "logps/chosen": -1.309762954711914, "logps/rejected": -1.675286054611206, "loss": 1.0163, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.309762954711914, "rewards/margins": 0.3655230700969696, "rewards/rejected": -1.675286054611206, "sft_loss": 1.3190863132476807, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 7.788529932811042, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.2938595116138458, "logits/rejected": -0.10211338102817535, "logps/chosen": -1.279653549194336, "logps/rejected": -1.7211673259735107, "loss": 1.0055, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.279653549194336, "rewards/margins": 0.4415138363838196, "rewards/rejected": -1.7211673259735107, "sft_loss": 1.3394492864608765, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 7.232847549769498, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.2455311268568039, "logits/rejected": -0.15797817707061768, "logps/chosen": -1.3783433437347412, "logps/rejected": -1.723474144935608, "loss": 1.058, "rewards/accuracies": 0.625, "rewards/chosen": -1.3783433437347412, "rewards/margins": 0.34513089060783386, "rewards/rejected": -1.723474144935608, "sft_loss": 1.4231681823730469, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 10.274974819074817, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.17151767015457153, "logits/rejected": -0.03050888516008854, "logps/chosen": -1.212294340133667, "logps/rejected": -1.779362678527832, "loss": 0.9717, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.212294340133667, "rewards/margins": 0.5670684576034546, "rewards/rejected": -1.779362678527832, "sft_loss": 1.330917239189148, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 10.474003261456547, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.2251317799091339, "logits/rejected": -0.031384989619255066, "logps/chosen": -1.3947486877441406, "logps/rejected": -1.8142874240875244, "loss": 1.022, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3947486877441406, "rewards/margins": 0.41953855752944946, "rewards/rejected": -1.8142874240875244, "sft_loss": 1.391324520111084, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 22.073423191846164, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.17992620170116425, "logits/rejected": -0.00832604430615902, "logps/chosen": -1.2970690727233887, "logps/rejected": -1.7878872156143188, "loss": 1.0178, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2970690727233887, "rewards/margins": 0.49081793427467346, "rewards/rejected": -1.7878872156143188, "sft_loss": 1.368438959121704, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 13.437164800427897, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.15775267779827118, "logits/rejected": -0.10851816833019257, "logps/chosen": -1.3005092144012451, "logps/rejected": -1.8442541360855103, "loss": 0.963, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3005092144012451, "rewards/margins": 0.5437448024749756, "rewards/rejected": -1.8442541360855103, "sft_loss": 1.3481507301330566, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 8.324524926032959, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.204762265086174, "logits/rejected": -0.12053843587636948, "logps/chosen": -1.254342794418335, "logps/rejected": -1.835875153541565, "loss": 0.9696, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.254342794418335, "rewards/margins": 0.5815322399139404, "rewards/rejected": -1.835875153541565, "sft_loss": 1.3362596035003662, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 11.093237168255653, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.24492064118385315, "logits/rejected": -0.025628382340073586, "logps/chosen": -1.2933624982833862, "logps/rejected": -1.8775631189346313, "loss": 0.9738, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2933624982833862, "rewards/margins": 0.5842007398605347, "rewards/rejected": -1.8775631189346313, "sft_loss": 1.3566689491271973, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 9.123638617689465, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.16828450560569763, "logits/rejected": -0.08324885368347168, "logps/chosen": -1.3041160106658936, "logps/rejected": -1.8632113933563232, "loss": 0.9844, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3041160106658936, "rewards/margins": 0.5590953826904297, "rewards/rejected": -1.8632113933563232, "sft_loss": 1.351027250289917, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 9.575295181352812, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.15587007999420166, "logits/rejected": -0.16831621527671814, "logps/chosen": -1.4001448154449463, "logps/rejected": -2.0440139770507812, "loss": 1.0319, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4001448154449463, "rewards/margins": 0.6438690423965454, "rewards/rejected": -2.0440139770507812, "sft_loss": 1.446777105331421, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 9.246711856267877, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.31833615899086, "logits/rejected": -0.22816014289855957, "logps/chosen": -1.2842254638671875, "logps/rejected": -1.8089005947113037, "loss": 0.9756, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2842254638671875, "rewards/margins": 0.5246752500534058, "rewards/rejected": -1.8089005947113037, "sft_loss": 1.3310213088989258, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 9.691411327588451, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.13669563829898834, "logits/rejected": -0.0020809501875191927, "logps/chosen": -1.4141411781311035, "logps/rejected": -1.9502674341201782, "loss": 1.0033, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4141411781311035, "rewards/margins": 0.5361261367797852, "rewards/rejected": -1.9502674341201782, "sft_loss": 1.4029778242111206, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 7.939966528509989, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.1929643601179123, "logits/rejected": -0.09834329783916473, "logps/chosen": -1.2604628801345825, "logps/rejected": -1.8470321893692017, "loss": 0.9755, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2604628801345825, "rewards/margins": 0.58656907081604, "rewards/rejected": -1.8470321893692017, "sft_loss": 1.3392512798309326, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 7.6808240546636615, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.1238643079996109, "logits/rejected": -0.14029282331466675, "logps/chosen": -1.2798162698745728, "logps/rejected": -1.7341972589492798, "loss": 0.9756, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2798162698745728, "rewards/margins": 0.45438089966773987, "rewards/rejected": -1.7341972589492798, "sft_loss": 1.2994613647460938, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 11.861276253624242, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.185426265001297, "logits/rejected": -0.026468921452760696, "logps/chosen": -1.3891743421554565, "logps/rejected": -1.7652900218963623, "loss": 1.0455, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3891743421554565, "rewards/margins": 0.3761156499385834, "rewards/rejected": -1.7652900218963623, "sft_loss": 1.4331741333007812, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 12.865007757777473, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.20513662695884705, "logits/rejected": -0.08602355420589447, "logps/chosen": -1.3340986967086792, "logps/rejected": -1.872727394104004, "loss": 1.0218, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3340986967086792, "rewards/margins": 0.5386286973953247, "rewards/rejected": -1.872727394104004, "sft_loss": 1.3980815410614014, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 25.443956510077477, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.0756000429391861, "logits/rejected": -0.008861005306243896, "logps/chosen": -1.3441126346588135, "logps/rejected": -1.889350175857544, "loss": 0.9776, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3441126346588135, "rewards/margins": 0.5452374815940857, "rewards/rejected": -1.889350175857544, "sft_loss": 1.373456358909607, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 9.939323030947252, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.2153160274028778, "logits/rejected": -0.10737192630767822, "logps/chosen": -1.32200026512146, "logps/rejected": -1.6421655416488647, "loss": 1.0551, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.32200026512146, "rewards/margins": 0.32016539573669434, "rewards/rejected": -1.6421655416488647, "sft_loss": 1.335711121559143, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 9.617887992384906, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.2255008965730667, "logits/rejected": -0.0034437566064298153, "logps/chosen": -1.2645217180252075, "logps/rejected": -1.8279240131378174, "loss": 0.9514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2645217180252075, "rewards/margins": 0.5634021759033203, "rewards/rejected": -1.8279240131378174, "sft_loss": 1.313612461090088, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 7.5739248884185395, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.2612372040748596, "logits/rejected": -0.24644172191619873, "logps/chosen": -1.2250291109085083, "logps/rejected": -1.7144653797149658, "loss": 0.9706, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2250291109085083, "rewards/margins": 0.48943623900413513, "rewards/rejected": -1.7144653797149658, "sft_loss": 1.2682294845581055, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 7.365542445718876, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.12606190145015717, "logits/rejected": -0.014820749871432781, "logps/chosen": -1.2924238443374634, "logps/rejected": -1.892459511756897, "loss": 0.9807, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2924238443374634, "rewards/margins": 0.600035548210144, "rewards/rejected": -1.892459511756897, "sft_loss": 1.3498013019561768, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 9.238459788940935, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.3326072096824646, "logits/rejected": -0.18114691972732544, "logps/chosen": -1.3395802974700928, "logps/rejected": -1.7088123559951782, "loss": 1.0665, "rewards/accuracies": 0.625, "rewards/chosen": -1.3395802974700928, "rewards/margins": 0.3692319989204407, "rewards/rejected": -1.7088123559951782, "sft_loss": 1.3958898782730103, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 9.822847787054164, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.05810268968343735, "logits/rejected": -0.06424954533576965, "logps/chosen": -1.384313941001892, "logps/rejected": -1.8423607349395752, "loss": 1.0364, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.384313941001892, "rewards/margins": 0.45804667472839355, "rewards/rejected": -1.8423607349395752, "sft_loss": 1.4010447263717651, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 7.372415164748828, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.25069189071655273, "logits/rejected": -0.17634975910186768, "logps/chosen": -1.283087134361267, "logps/rejected": -1.765835165977478, "loss": 0.9576, "rewards/accuracies": 0.71875, "rewards/chosen": -1.283087134361267, "rewards/margins": 0.4827481210231781, "rewards/rejected": -1.765835165977478, "sft_loss": 1.3241028785705566, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 6.401387220245433, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.228153795003891, "logits/rejected": -0.1123252660036087, "logps/chosen": -1.287208080291748, "logps/rejected": -1.9734983444213867, "loss": 0.9658, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.287208080291748, "rewards/margins": 0.6862903833389282, "rewards/rejected": -1.9734983444213867, "sft_loss": 1.3538731336593628, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 7.858392367779872, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.2269498109817505, "logits/rejected": -0.09576071798801422, "logps/chosen": -1.3312914371490479, "logps/rejected": -1.813730001449585, "loss": 1.0197, "rewards/accuracies": 0.625, "rewards/chosen": -1.3312914371490479, "rewards/margins": 0.4824386239051819, "rewards/rejected": -1.813730001449585, "sft_loss": 1.3493551015853882, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 6.255595915276127, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.26727840304374695, "logits/rejected": -0.058417391031980515, "logps/chosen": -1.2808057069778442, "logps/rejected": -1.8030893802642822, "loss": 0.9971, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2808057069778442, "rewards/margins": 0.522283673286438, "rewards/rejected": -1.8030893802642822, "sft_loss": 1.350306749343872, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 6.500444843165474, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.25225088000297546, "logits/rejected": -0.06664416939020157, "logps/chosen": -1.3443737030029297, "logps/rejected": -1.9076101779937744, "loss": 0.9705, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3443737030029297, "rewards/margins": 0.5632363557815552, "rewards/rejected": -1.9076101779937744, "sft_loss": 1.3907678127288818, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 7.920412917789237, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.2684093713760376, "logits/rejected": -0.1828213632106781, "logps/chosen": -1.3897509574890137, "logps/rejected": -1.9323326349258423, "loss": 1.0114, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3897509574890137, "rewards/margins": 0.5425814986228943, "rewards/rejected": -1.9323326349258423, "sft_loss": 1.3893733024597168, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 5.807782159852171, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.19051989912986755, "logits/rejected": -0.06599706411361694, "logps/chosen": -1.3841588497161865, "logps/rejected": -1.6875133514404297, "loss": 1.0851, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3841588497161865, "rewards/margins": 0.3033544421195984, "rewards/rejected": -1.6875133514404297, "sft_loss": 1.4423117637634277, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 6.195477499990059, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.18834324181079865, "logits/rejected": -0.06376080214977264, "logps/chosen": -1.305316686630249, "logps/rejected": -1.6655584573745728, "loss": 1.0326, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.305316686630249, "rewards/margins": 0.36024174094200134, "rewards/rejected": -1.6655584573745728, "sft_loss": 1.385566234588623, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 14.446952301432269, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.2903379201889038, "logits/rejected": -0.16157583892345428, "logps/chosen": -1.3384888172149658, "logps/rejected": -1.7572097778320312, "loss": 1.0311, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3384888172149658, "rewards/margins": 0.4187210202217102, "rewards/rejected": -1.7572097778320312, "sft_loss": 1.3747570514678955, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 5.840044296050715, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.18125630915164948, "logits/rejected": -0.0014208063948899508, "logps/chosen": -1.3621944189071655, "logps/rejected": -1.7927730083465576, "loss": 1.0284, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3621944189071655, "rewards/margins": 0.4305785298347473, "rewards/rejected": -1.7927730083465576, "sft_loss": 1.3738056421279907, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 13.818786645322406, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.2514936625957489, "logits/rejected": -0.013041814789175987, "logps/chosen": -1.3561211824417114, "logps/rejected": -1.9976732730865479, "loss": 0.9825, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3561211824417114, "rewards/margins": 0.6415520310401917, "rewards/rejected": -1.9976732730865479, "sft_loss": 1.3830833435058594, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 14.17690399009336, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.22364842891693115, "logits/rejected": -0.1085544228553772, "logps/chosen": -1.2623711824417114, "logps/rejected": -1.7410624027252197, "loss": 0.9569, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2623711824417114, "rewards/margins": 0.47869133949279785, "rewards/rejected": -1.7410624027252197, "sft_loss": 1.2642004489898682, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 8.380378848216962, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.3221927285194397, "logits/rejected": -0.14524655044078827, "logps/chosen": -1.2813420295715332, "logps/rejected": -1.8537393808364868, "loss": 0.9835, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2813420295715332, "rewards/margins": 0.5723973512649536, "rewards/rejected": -1.8537393808364868, "sft_loss": 1.3393852710723877, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 7.55389047157114, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.13835462927818298, "logits/rejected": 0.10697861015796661, "logps/chosen": -1.3332964181900024, "logps/rejected": -2.028346300125122, "loss": 0.9493, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3332964181900024, "rewards/margins": 0.6950497627258301, "rewards/rejected": -2.028346300125122, "sft_loss": 1.313429355621338, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 11.316416033813448, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.20318862795829773, "logits/rejected": -0.11733639240264893, "logps/chosen": -1.4059638977050781, "logps/rejected": -1.9184436798095703, "loss": 1.038, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4059638977050781, "rewards/margins": 0.5124797224998474, "rewards/rejected": -1.9184436798095703, "sft_loss": 1.4158484935760498, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 8.200841921982983, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.21803493797779083, "logits/rejected": -0.12491512298583984, "logps/chosen": -1.25760018825531, "logps/rejected": -1.7700488567352295, "loss": 0.9609, "rewards/accuracies": 0.71875, "rewards/chosen": -1.25760018825531, "rewards/margins": 0.5124487280845642, "rewards/rejected": -1.7700488567352295, "sft_loss": 1.3133172988891602, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 12.658528990316567, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.12976686656475067, "logits/rejected": -0.13357266783714294, "logps/chosen": -1.2693363428115845, "logps/rejected": -1.7833693027496338, "loss": 0.9779, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2693363428115845, "rewards/margins": 0.5140329599380493, "rewards/rejected": -1.7833693027496338, "sft_loss": 1.337398648262024, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 10.497517676948384, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.28647470474243164, "logits/rejected": -0.10886237770318985, "logps/chosen": -1.3986010551452637, "logps/rejected": -1.8781064748764038, "loss": 1.0317, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3986010551452637, "rewards/margins": 0.4795055389404297, "rewards/rejected": -1.8781064748764038, "sft_loss": 1.412027359008789, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 8.056840404356597, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.24646127223968506, "logits/rejected": -0.19223013520240784, "logps/chosen": -1.3325896263122559, "logps/rejected": -1.8143196105957031, "loss": 0.9822, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3325896263122559, "rewards/margins": 0.48172998428344727, "rewards/rejected": -1.8143196105957031, "sft_loss": 1.343353033065796, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 9.084221826454508, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.21413850784301758, "logits/rejected": -0.0885709673166275, "logps/chosen": -1.3127249479293823, "logps/rejected": -1.7602211236953735, "loss": 0.996, "rewards/accuracies": 0.625, "rewards/chosen": -1.3127249479293823, "rewards/margins": 0.4474961757659912, "rewards/rejected": -1.7602211236953735, "sft_loss": 1.3310400247573853, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 11.449525862847201, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.30147385597229004, "logits/rejected": -0.18163147568702698, "logps/chosen": -1.3303790092468262, "logps/rejected": -1.975589394569397, "loss": 0.9805, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3303790092468262, "rewards/margins": 0.6452105641365051, "rewards/rejected": -1.975589394569397, "sft_loss": 1.419938325881958, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 8.895095429807123, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.2536405324935913, "logits/rejected": -0.05459263175725937, "logps/chosen": -1.3059298992156982, "logps/rejected": -1.970513105392456, "loss": 0.959, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3059298992156982, "rewards/margins": 0.6645833253860474, "rewards/rejected": -1.970513105392456, "sft_loss": 1.3471238613128662, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 12.012076106940423, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.23025648295879364, "logits/rejected": -0.07983642816543579, "logps/chosen": -1.362376093864441, "logps/rejected": -1.7934879064559937, "loss": 1.0226, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.362376093864441, "rewards/margins": 0.4311119616031647, "rewards/rejected": -1.7934879064559937, "sft_loss": 1.4036413431167603, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 8.975404973926123, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.2329825609922409, "logits/rejected": -0.09870786964893341, "logps/chosen": -1.3620847463607788, "logps/rejected": -1.8238025903701782, "loss": 1.0331, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3620847463607788, "rewards/margins": 0.46171754598617554, "rewards/rejected": -1.8238025903701782, "sft_loss": 1.3915579319000244, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 10.39904637190083, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.23698978126049042, "logits/rejected": -0.0770409107208252, "logps/chosen": -1.3104714155197144, "logps/rejected": -1.920532464981079, "loss": 0.998, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3104714155197144, "rewards/margins": 0.61006098985672, "rewards/rejected": -1.920532464981079, "sft_loss": 1.3576505184173584, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 10.299178045423748, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.22072386741638184, "logits/rejected": -0.1066427230834961, "logps/chosen": -1.4406993389129639, "logps/rejected": -1.9226884841918945, "loss": 1.0741, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4406993389129639, "rewards/margins": 0.4819890558719635, "rewards/rejected": -1.9226884841918945, "sft_loss": 1.416477084159851, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 5.747903690377897, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.17060045897960663, "logits/rejected": -0.10004113614559174, "logps/chosen": -1.4150245189666748, "logps/rejected": -1.9153951406478882, "loss": 1.0166, "rewards/accuracies": 0.625, "rewards/chosen": -1.4150245189666748, "rewards/margins": 0.5003708600997925, "rewards/rejected": -1.9153951406478882, "sft_loss": 1.3764393329620361, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 9.207914170551703, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.23539428412914276, "logits/rejected": -0.06189621612429619, "logps/chosen": -1.2505683898925781, "logps/rejected": -1.98164963722229, "loss": 0.9379, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2505683898925781, "rewards/margins": 0.7310811877250671, "rewards/rejected": -1.98164963722229, "sft_loss": 1.3134586811065674, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 13.990820654898867, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.24911001324653625, "logits/rejected": -0.16555368900299072, "logps/chosen": -1.4709831476211548, "logps/rejected": -1.7603830099105835, "loss": 1.1095, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4709831476211548, "rewards/margins": 0.2893998622894287, "rewards/rejected": -1.7603830099105835, "sft_loss": 1.459791898727417, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 7.4323654415717355, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.15631893277168274, "logits/rejected": -0.09263060986995697, "logps/chosen": -1.3558018207550049, "logps/rejected": -1.9541645050048828, "loss": 0.9599, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3558018207550049, "rewards/margins": 0.5983625650405884, "rewards/rejected": -1.9541645050048828, "sft_loss": 1.377393364906311, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 10.017881800957118, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.26166990399360657, "logits/rejected": -0.14959931373596191, "logps/chosen": -1.379631757736206, "logps/rejected": -1.8234903812408447, "loss": 1.0108, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.379631757736206, "rewards/margins": 0.44385847449302673, "rewards/rejected": -1.8234903812408447, "sft_loss": 1.4106724262237549, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 9.705232333372008, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.31485554575920105, "logits/rejected": -0.10140397399663925, "logps/chosen": -1.4201301336288452, "logps/rejected": -1.8692519664764404, "loss": 1.025, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4201301336288452, "rewards/margins": 0.44912201166152954, "rewards/rejected": -1.8692519664764404, "sft_loss": 1.43977952003479, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 9.662438801196549, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.11434853076934814, "logits/rejected": -0.11242060363292694, "logps/chosen": -1.3031599521636963, "logps/rejected": -1.8188354969024658, "loss": 0.9819, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3031599521636963, "rewards/margins": 0.5156753659248352, "rewards/rejected": -1.8188354969024658, "sft_loss": 1.3415518999099731, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 10.933654268222613, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.19458740949630737, "logits/rejected": -0.054971061646938324, "logps/chosen": -1.232940435409546, "logps/rejected": -1.8467018604278564, "loss": 0.929, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.232940435409546, "rewards/margins": 0.6137614846229553, "rewards/rejected": -1.8467018604278564, "sft_loss": 1.2706055641174316, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 7.592807117898535, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.3091137409210205, "logits/rejected": -0.11701484024524689, "logps/chosen": -1.365443468093872, "logps/rejected": -1.9199018478393555, "loss": 1.0253, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.365443468093872, "rewards/margins": 0.5544580817222595, "rewards/rejected": -1.9199018478393555, "sft_loss": 1.44214928150177, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 7.076712114080835, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.240199014544487, "logits/rejected": -0.05072479322552681, "logps/chosen": -1.355287790298462, "logps/rejected": -1.8138258457183838, "loss": 1.0286, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.355287790298462, "rewards/margins": 0.45853790640830994, "rewards/rejected": -1.8138258457183838, "sft_loss": 1.3838344812393188, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 13.56410524646758, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.2634291648864746, "logits/rejected": -0.11340577900409698, "logps/chosen": -1.3418443202972412, "logps/rejected": -1.8998095989227295, "loss": 0.9715, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3418443202972412, "rewards/margins": 0.5579651594161987, "rewards/rejected": -1.8998095989227295, "sft_loss": 1.3820970058441162, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 5.686835263850377, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.23777596652507782, "logits/rejected": -0.10516651719808578, "logps/chosen": -1.288923978805542, "logps/rejected": -1.724108338356018, "loss": 1.0092, "rewards/accuracies": 0.65625, "rewards/chosen": -1.288923978805542, "rewards/margins": 0.43518438935279846, "rewards/rejected": -1.724108338356018, "sft_loss": 1.348914384841919, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 11.53155326090315, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.18005147576332092, "logits/rejected": -0.01693601906299591, "logps/chosen": -1.3340141773223877, "logps/rejected": -1.9913352727890015, "loss": 0.9824, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3340141773223877, "rewards/margins": 0.6573209762573242, "rewards/rejected": -1.9913352727890015, "sft_loss": 1.3819031715393066, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 7.213618434542793, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.2869768440723419, "logits/rejected": -0.13710127770900726, "logps/chosen": -1.3353666067123413, "logps/rejected": -1.9065090417861938, "loss": 0.9676, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3353666067123413, "rewards/margins": 0.5711422562599182, "rewards/rejected": -1.9065090417861938, "sft_loss": 1.3584064245224, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.11583670228719711, "eval_logits/rejected": 0.20641706883907318, "eval_logps/chosen": -1.4061453342437744, "eval_logps/rejected": -1.8540223836898804, "eval_loss": 1.041849136352539, "eval_rewards/accuracies": 0.6231454014778137, "eval_rewards/chosen": -1.4061453342437744, "eval_rewards/margins": 0.44787701964378357, "eval_rewards/rejected": -1.8540223836898804, "eval_runtime": 48.6845, "eval_samples_per_second": 27.627, "eval_sft_loss": 1.4054352045059204, "eval_steps_per_second": 6.922, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 16.531761456585027, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.2997193932533264, "logits/rejected": -0.25852295756340027, "logps/chosen": -1.2969939708709717, "logps/rejected": -1.6899290084838867, "loss": 1.0055, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2969939708709717, "rewards/margins": 0.39293503761291504, "rewards/rejected": -1.6899290084838867, "sft_loss": 1.3143447637557983, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 7.539170451075638, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.21077589690685272, "logits/rejected": -0.1720072478055954, "logps/chosen": -1.408328890800476, "logps/rejected": -2.056427478790283, "loss": 0.9926, "rewards/accuracies": 0.6875, "rewards/chosen": -1.408328890800476, "rewards/margins": 0.6480986475944519, "rewards/rejected": -2.056427478790283, "sft_loss": 1.425734519958496, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 8.089600230518249, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.21279382705688477, "logits/rejected": -0.05223933607339859, "logps/chosen": -1.3679611682891846, "logps/rejected": -1.826300859451294, "loss": 1.0134, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3679611682891846, "rewards/margins": 0.45833978056907654, "rewards/rejected": -1.826300859451294, "sft_loss": 1.3863824605941772, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 10.501997573777427, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.24000367522239685, "logits/rejected": -0.1543852984905243, "logps/chosen": -1.3647658824920654, "logps/rejected": -1.688582420349121, "loss": 1.0335, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3647658824920654, "rewards/margins": 0.3238166272640228, "rewards/rejected": -1.688582420349121, "sft_loss": 1.36466383934021, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 13.152802899680687, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.1994973123073578, "logits/rejected": -0.08989440649747849, "logps/chosen": -1.2513803243637085, "logps/rejected": -1.6642259359359741, "loss": 1.0053, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2513803243637085, "rewards/margins": 0.4128456115722656, "rewards/rejected": -1.6642259359359741, "sft_loss": 1.3685729503631592, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 6.3273467397298875, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.2455778419971466, "logits/rejected": -0.07191842049360275, "logps/chosen": -1.3764415979385376, "logps/rejected": -1.8642089366912842, "loss": 1.033, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3764415979385376, "rewards/margins": 0.48776760697364807, "rewards/rejected": -1.8642089366912842, "sft_loss": 1.4149798154830933, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 6.326489480123552, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.3077549934387207, "logits/rejected": -0.08836190402507782, "logps/chosen": -1.262527585029602, "logps/rejected": -1.7917951345443726, "loss": 0.9662, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.262527585029602, "rewards/margins": 0.5292677283287048, "rewards/rejected": -1.7917951345443726, "sft_loss": 1.3449105024337769, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 9.481565949086859, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.31242865324020386, "logits/rejected": -0.09762193262577057, "logps/chosen": -1.3546260595321655, "logps/rejected": -1.7663938999176025, "loss": 1.044, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3546260595321655, "rewards/margins": 0.4117676615715027, "rewards/rejected": -1.7663938999176025, "sft_loss": 1.40595281124115, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 8.325158454192946, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.27985435724258423, "logits/rejected": -0.0948081687092781, "logps/chosen": -1.3356212377548218, "logps/rejected": -1.74164617061615, "loss": 1.0353, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3356212377548218, "rewards/margins": 0.4060249924659729, "rewards/rejected": -1.74164617061615, "sft_loss": 1.3741556406021118, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 13.884407642700152, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.19278530776500702, "logits/rejected": -0.06277438253164291, "logps/chosen": -1.4006202220916748, "logps/rejected": -1.9403584003448486, "loss": 1.0405, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4006202220916748, "rewards/margins": 0.5397380590438843, "rewards/rejected": -1.9403584003448486, "sft_loss": 1.402583360671997, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 11.829877249857724, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.2743862271308899, "logits/rejected": -0.17100296914577484, "logps/chosen": -1.3506872653961182, "logps/rejected": -1.8455251455307007, "loss": 1.0189, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3506872653961182, "rewards/margins": 0.4948379099369049, "rewards/rejected": -1.8455251455307007, "sft_loss": 1.395976185798645, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 8.967902356172923, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.30608153343200684, "logits/rejected": -0.0737648606300354, "logps/chosen": -1.2880511283874512, "logps/rejected": -1.9710843563079834, "loss": 0.9693, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2880511283874512, "rewards/margins": 0.6830333471298218, "rewards/rejected": -1.9710843563079834, "sft_loss": 1.3572402000427246, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 13.691400586547886, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.29284581542015076, "logits/rejected": -0.10276402533054352, "logps/chosen": -1.3815476894378662, "logps/rejected": -1.816573143005371, "loss": 1.0459, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3815476894378662, "rewards/margins": 0.43502530455589294, "rewards/rejected": -1.816573143005371, "sft_loss": 1.3989099264144897, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 7.438040374763478, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.1603635549545288, "logits/rejected": -0.08685074001550674, "logps/chosen": -1.2549306154251099, "logps/rejected": -1.7096410989761353, "loss": 0.9981, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2549306154251099, "rewards/margins": 0.4547103941440582, "rewards/rejected": -1.7096410989761353, "sft_loss": 1.2909096479415894, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 12.45344214229761, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.2482972890138626, "logits/rejected": -0.023926924914121628, "logps/chosen": -1.334442377090454, "logps/rejected": -1.807771921157837, "loss": 0.9952, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.334442377090454, "rewards/margins": 0.47332969307899475, "rewards/rejected": -1.807771921157837, "sft_loss": 1.3329417705535889, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 9.739524869817904, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.23262295126914978, "logits/rejected": -0.14341866970062256, "logps/chosen": -1.207165241241455, "logps/rejected": -1.6426270008087158, "loss": 0.9879, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.207165241241455, "rewards/margins": 0.4354618489742279, "rewards/rejected": -1.6426270008087158, "sft_loss": 1.2839418649673462, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 8.846852816642949, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.20950651168823242, "logits/rejected": -0.11397744715213776, "logps/chosen": -1.3568718433380127, "logps/rejected": -1.9146311283111572, "loss": 0.9793, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3568718433380127, "rewards/margins": 0.5577594041824341, "rewards/rejected": -1.9146311283111572, "sft_loss": 1.3494882583618164, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 10.164110735715587, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.2683698236942291, "logits/rejected": -0.04780945926904678, "logps/chosen": -1.2910733222961426, "logps/rejected": -1.9252097606658936, "loss": 0.9495, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2910733222961426, "rewards/margins": 0.6341365575790405, "rewards/rejected": -1.9252097606658936, "sft_loss": 1.3434323072433472, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 6.3916695628160545, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.31511589884757996, "logits/rejected": -0.09818333387374878, "logps/chosen": -1.309179425239563, "logps/rejected": -1.787996530532837, "loss": 0.9986, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.309179425239563, "rewards/margins": 0.47881707549095154, "rewards/rejected": -1.787996530532837, "sft_loss": 1.3815486431121826, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 5.9588450975580685, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.12920327484607697, "logits/rejected": -0.1269599050283432, "logps/chosen": -1.3116642236709595, "logps/rejected": -1.7722593545913696, "loss": 1.0026, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3116642236709595, "rewards/margins": 0.4605952203273773, "rewards/rejected": -1.7722593545913696, "sft_loss": 1.3667010068893433, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 10.936536750088786, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.2735688090324402, "logits/rejected": -0.1245633214712143, "logps/chosen": -1.3134715557098389, "logps/rejected": -1.8533337116241455, "loss": 0.965, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3134715557098389, "rewards/margins": 0.5398621559143066, "rewards/rejected": -1.8533337116241455, "sft_loss": 1.350994348526001, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 17.92011875022809, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.2906729280948639, "logits/rejected": -0.1478831022977829, "logps/chosen": -1.3039506673812866, "logps/rejected": -1.7351878881454468, "loss": 0.9837, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3039506673812866, "rewards/margins": 0.43123722076416016, "rewards/rejected": -1.7351878881454468, "sft_loss": 1.2791199684143066, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 6.916233284559286, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.2609194815158844, "logits/rejected": -0.1757608950138092, "logps/chosen": -1.3002293109893799, "logps/rejected": -1.7666009664535522, "loss": 0.9892, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3002293109893799, "rewards/margins": 0.4663717746734619, "rewards/rejected": -1.7666009664535522, "sft_loss": 1.34689462184906, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 7.971397826258169, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.24207957088947296, "logits/rejected": -0.1476575881242752, "logps/chosen": -1.281394362449646, "logps/rejected": -1.8730818033218384, "loss": 0.9371, "rewards/accuracies": 0.6875, "rewards/chosen": -1.281394362449646, "rewards/margins": 0.5916873216629028, "rewards/rejected": -1.8730818033218384, "sft_loss": 1.2919895648956299, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 11.465781260730042, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.3093206286430359, "logits/rejected": -0.12962278723716736, "logps/chosen": -1.4094141721725464, "logps/rejected": -1.9580589532852173, "loss": 1.0531, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4094141721725464, "rewards/margins": 0.5486448407173157, "rewards/rejected": -1.9580589532852173, "sft_loss": 1.4824426174163818, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 12.302336490667379, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.27253293991088867, "logits/rejected": -0.17779898643493652, "logps/chosen": -1.3364722728729248, "logps/rejected": -1.7032201290130615, "loss": 1.0275, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3364722728729248, "rewards/margins": 0.36674779653549194, "rewards/rejected": -1.7032201290130615, "sft_loss": 1.3226144313812256, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 9.169403824852585, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.11461882293224335, "logits/rejected": 0.003153522266075015, "logps/chosen": -1.2885421514511108, "logps/rejected": -1.7717559337615967, "loss": 1.0068, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2885421514511108, "rewards/margins": 0.4832138121128082, "rewards/rejected": -1.7717559337615967, "sft_loss": 1.3321475982666016, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 6.341765185378913, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.1359674036502838, "logits/rejected": -0.057124294340610504, "logps/chosen": -1.2703921794891357, "logps/rejected": -1.7722088098526, "loss": 0.989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2703921794891357, "rewards/margins": 0.5018167495727539, "rewards/rejected": -1.7722088098526, "sft_loss": 1.3047778606414795, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 14.248618673793011, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.18335461616516113, "logits/rejected": -0.04374461621046066, "logps/chosen": -1.2934150695800781, "logps/rejected": -1.7944812774658203, "loss": 1.007, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2934150695800781, "rewards/margins": 0.5010663270950317, "rewards/rejected": -1.7944812774658203, "sft_loss": 1.3242768049240112, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 9.205092096895898, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.25516027212142944, "logits/rejected": -0.1437629908323288, "logps/chosen": -1.2467381954193115, "logps/rejected": -1.7445876598358154, "loss": 0.9732, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2467381954193115, "rewards/margins": 0.49784937500953674, "rewards/rejected": -1.7445876598358154, "sft_loss": 1.276613712310791, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 7.101111088315062, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.19623471796512604, "logits/rejected": -0.12268761545419693, "logps/chosen": -1.3916569948196411, "logps/rejected": -1.7594877481460571, "loss": 1.0586, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3916569948196411, "rewards/margins": 0.367830753326416, "rewards/rejected": -1.7594877481460571, "sft_loss": 1.4670602083206177, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 7.1133746677185785, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.23669187724590302, "logits/rejected": -0.045713476836681366, "logps/chosen": -1.287642478942871, "logps/rejected": -1.7388050556182861, "loss": 1.0283, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.287642478942871, "rewards/margins": 0.45116251707077026, "rewards/rejected": -1.7388050556182861, "sft_loss": 1.2754676342010498, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 9.04971905642823, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.182923823595047, "logits/rejected": -0.09119474142789841, "logps/chosen": -1.38874089717865, "logps/rejected": -1.8274568319320679, "loss": 1.0187, "rewards/accuracies": 0.65625, "rewards/chosen": -1.38874089717865, "rewards/margins": 0.4387160837650299, "rewards/rejected": -1.8274568319320679, "sft_loss": 1.4005342721939087, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 14.466176430071794, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.14486274123191833, "logits/rejected": 0.015377027913928032, "logps/chosen": -1.2977030277252197, "logps/rejected": -1.8409439325332642, "loss": 0.9696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2977030277252197, "rewards/margins": 0.5432409644126892, "rewards/rejected": -1.8409439325332642, "sft_loss": 1.3190866708755493, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 16.65086539067498, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.1298166811466217, "logits/rejected": -0.07689005136489868, "logps/chosen": -1.3006457090377808, "logps/rejected": -1.753061056137085, "loss": 1.009, "rewards/accuracies": 0.625, "rewards/chosen": -1.3006457090377808, "rewards/margins": 0.4524151682853699, "rewards/rejected": -1.753061056137085, "sft_loss": 1.353380799293518, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 10.575069440877996, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.21864600479602814, "logits/rejected": -0.16788628697395325, "logps/chosen": -1.2870771884918213, "logps/rejected": -1.7425334453582764, "loss": 0.9956, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2870771884918213, "rewards/margins": 0.4554562568664551, "rewards/rejected": -1.7425334453582764, "sft_loss": 1.3278234004974365, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 7.601451022217489, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.19092920422554016, "logits/rejected": -0.0491974875330925, "logps/chosen": -1.3717070817947388, "logps/rejected": -1.7525761127471924, "loss": 1.083, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3717070817947388, "rewards/margins": 0.3808690905570984, "rewards/rejected": -1.7525761127471924, "sft_loss": 1.4324686527252197, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 7.488293856429623, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.307849258184433, "logits/rejected": -0.1173395961523056, "logps/chosen": -1.2461965084075928, "logps/rejected": -1.6748631000518799, "loss": 1.0013, "rewards/accuracies": 0.625, "rewards/chosen": -1.2461965084075928, "rewards/margins": 0.4286664128303528, "rewards/rejected": -1.6748631000518799, "sft_loss": 1.3064314126968384, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 18.458289821572553, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.25331372022628784, "logits/rejected": -0.1301385462284088, "logps/chosen": -1.3207085132598877, "logps/rejected": -1.8877767324447632, "loss": 0.9931, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3207085132598877, "rewards/margins": 0.5670682191848755, "rewards/rejected": -1.8877767324447632, "sft_loss": 1.3454453945159912, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 7.635517310171866, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.24821238219738007, "logits/rejected": 0.020019862800836563, "logps/chosen": -1.3479750156402588, "logps/rejected": -1.9506289958953857, "loss": 0.9808, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3479750156402588, "rewards/margins": 0.6026536822319031, "rewards/rejected": -1.9506289958953857, "sft_loss": 1.4187161922454834, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 11.600826497569802, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.17614629864692688, "logits/rejected": -0.031257856637239456, "logps/chosen": -1.3354041576385498, "logps/rejected": -1.9279369115829468, "loss": 1.0027, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3354041576385498, "rewards/margins": 0.5925329327583313, "rewards/rejected": -1.9279369115829468, "sft_loss": 1.3648275136947632, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 7.982304524200157, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.2403148114681244, "logits/rejected": -0.08003642410039902, "logps/chosen": -1.3246963024139404, "logps/rejected": -1.8701080083847046, "loss": 0.9869, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3246963024139404, "rewards/margins": 0.5454118847846985, "rewards/rejected": -1.8701080083847046, "sft_loss": 1.344297170639038, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 9.660647937774073, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.16011041402816772, "logits/rejected": -0.0173188503831625, "logps/chosen": -1.3368616104125977, "logps/rejected": -2.026045560836792, "loss": 0.9754, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3368616104125977, "rewards/margins": 0.6891839504241943, "rewards/rejected": -2.026045560836792, "sft_loss": 1.380816102027893, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 7.359053426609902, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.1726696491241455, "logits/rejected": 0.0374867208302021, "logps/chosen": -1.2261712551116943, "logps/rejected": -1.7587896585464478, "loss": 0.9608, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2261712551116943, "rewards/margins": 0.5326187014579773, "rewards/rejected": -1.7587896585464478, "sft_loss": 1.3288418054580688, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 6.227833451403368, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.3180811405181885, "logits/rejected": -0.2705082595348358, "logps/chosen": -1.33169686794281, "logps/rejected": -1.805450677871704, "loss": 1.0013, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.33169686794281, "rewards/margins": 0.4737536907196045, "rewards/rejected": -1.805450677871704, "sft_loss": 1.3847988843917847, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 7.777287675881731, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.17647233605384827, "logits/rejected": -0.09123236685991287, "logps/chosen": -1.2847613096237183, "logps/rejected": -1.7988321781158447, "loss": 1.0017, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2847613096237183, "rewards/margins": 0.5140709280967712, "rewards/rejected": -1.7988321781158447, "sft_loss": 1.3374969959259033, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 16.697480712369682, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.13744884729385376, "logits/rejected": 0.016100814566016197, "logps/chosen": -1.2873327732086182, "logps/rejected": -1.947933554649353, "loss": 0.9606, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2873327732086182, "rewards/margins": 0.6606006026268005, "rewards/rejected": -1.947933554649353, "sft_loss": 1.306330680847168, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 7.35333599354952, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.1599302589893341, "logits/rejected": -0.05215344950556755, "logps/chosen": -1.3196965456008911, "logps/rejected": -1.8524376153945923, "loss": 0.9713, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3196965456008911, "rewards/margins": 0.532741129398346, "rewards/rejected": -1.8524376153945923, "sft_loss": 1.3606996536254883, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 8.44997951397884, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.19984640181064606, "logits/rejected": -0.06128763407468796, "logps/chosen": -1.294210433959961, "logps/rejected": -1.7580476999282837, "loss": 0.9853, "rewards/accuracies": 0.6875, "rewards/chosen": -1.294210433959961, "rewards/margins": 0.4638374447822571, "rewards/rejected": -1.7580476999282837, "sft_loss": 1.3605902194976807, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 8.153798854643007, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.24238982796669006, "logits/rejected": 0.01320638693869114, "logps/chosen": -1.3412058353424072, "logps/rejected": -1.9187650680541992, "loss": 0.9616, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3412058353424072, "rewards/margins": 0.5775595903396606, "rewards/rejected": -1.9187650680541992, "sft_loss": 1.3528225421905518, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 9.62999907350543, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.26737847924232483, "logits/rejected": -0.11783139407634735, "logps/chosen": -1.3052794933319092, "logps/rejected": -1.743168592453003, "loss": 0.997, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3052794933319092, "rewards/margins": 0.4378891587257385, "rewards/rejected": -1.743168592453003, "sft_loss": 1.3775510787963867, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 9.773237204542284, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.23270268738269806, "logits/rejected": -0.1101439818739891, "logps/chosen": -1.3616409301757812, "logps/rejected": -1.8167692422866821, "loss": 1.0086, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3616409301757812, "rewards/margins": 0.4551284909248352, "rewards/rejected": -1.8167692422866821, "sft_loss": 1.3947080373764038, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 13.46926687344113, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.22592651844024658, "logits/rejected": -0.15461456775665283, "logps/chosen": -1.4279885292053223, "logps/rejected": -1.8917471170425415, "loss": 1.0439, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4279885292053223, "rewards/margins": 0.4637584686279297, "rewards/rejected": -1.8917471170425415, "sft_loss": 1.4091001749038696, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 22.862875623764268, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.23195526003837585, "logits/rejected": -0.09586568176746368, "logps/chosen": -1.3377292156219482, "logps/rejected": -1.708822250366211, "loss": 1.0443, "rewards/accuracies": 0.625, "rewards/chosen": -1.3377292156219482, "rewards/margins": 0.3710930645465851, "rewards/rejected": -1.708822250366211, "sft_loss": 1.4078140258789062, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 11.316385205834427, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.17335200309753418, "logits/rejected": -0.02181796357035637, "logps/chosen": -1.2743074893951416, "logps/rejected": -1.803501844406128, "loss": 0.9722, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2743074893951416, "rewards/margins": 0.5291942358016968, "rewards/rejected": -1.803501844406128, "sft_loss": 1.2910643815994263, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 21.98504110227831, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.15896247327327728, "logits/rejected": -0.049371711909770966, "logps/chosen": -1.3332550525665283, "logps/rejected": -1.9437267780303955, "loss": 0.9747, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3332550525665283, "rewards/margins": 0.6104718446731567, "rewards/rejected": -1.9437267780303955, "sft_loss": 1.3414416313171387, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 11.770340730002667, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.2712337076663971, "logits/rejected": -0.0732741728425026, "logps/chosen": -1.3264377117156982, "logps/rejected": -1.7903629541397095, "loss": 0.9916, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3264377117156982, "rewards/margins": 0.4639252722263336, "rewards/rejected": -1.7903629541397095, "sft_loss": 1.3691695928573608, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 9.051087910498229, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.1903747022151947, "logits/rejected": -0.1403854936361313, "logps/chosen": -1.324100375175476, "logps/rejected": -1.8005352020263672, "loss": 0.999, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.324100375175476, "rewards/margins": 0.4764348864555359, "rewards/rejected": -1.8005352020263672, "sft_loss": 1.3471524715423584, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 7.615968755779422, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.17845022678375244, "logits/rejected": 0.018734300509095192, "logps/chosen": -1.3418115377426147, "logps/rejected": -1.8535064458847046, "loss": 1.0332, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3418115377426147, "rewards/margins": 0.5116950869560242, "rewards/rejected": -1.8535064458847046, "sft_loss": 1.3775501251220703, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 6.537479023406544, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.27379363775253296, "logits/rejected": -0.14433471858501434, "logps/chosen": -1.2867431640625, "logps/rejected": -1.788644790649414, "loss": 1.0146, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2867431640625, "rewards/margins": 0.5019017457962036, "rewards/rejected": -1.788644790649414, "sft_loss": 1.3657176494598389, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 12.955282547417086, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.14411696791648865, "logits/rejected": -0.10907924175262451, "logps/chosen": -1.3774569034576416, "logps/rejected": -1.8264074325561523, "loss": 1.0335, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3774569034576416, "rewards/margins": 0.44895029067993164, "rewards/rejected": -1.8264074325561523, "sft_loss": 1.369371771812439, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 10.083790285634256, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.24078384041786194, "logits/rejected": -0.08013347536325455, "logps/chosen": -1.3708370923995972, "logps/rejected": -1.9471477270126343, "loss": 0.9875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3708370923995972, "rewards/margins": 0.5763106346130371, "rewards/rejected": -1.9471477270126343, "sft_loss": 1.3982982635498047, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 9.32421974678991, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.21982285380363464, "logits/rejected": -0.18153129518032074, "logps/chosen": -1.4248440265655518, "logps/rejected": -1.8908706903457642, "loss": 1.0332, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4248440265655518, "rewards/margins": 0.4660266935825348, "rewards/rejected": -1.8908706903457642, "sft_loss": 1.477002739906311, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 11.542221088014736, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.1805545836687088, "logits/rejected": -0.07476134598255157, "logps/chosen": -1.2300540208816528, "logps/rejected": -1.7072839736938477, "loss": 0.9635, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2300540208816528, "rewards/margins": 0.4772297739982605, "rewards/rejected": -1.7072839736938477, "sft_loss": 1.300767183303833, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 11.64707144335193, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.24052152037620544, "logits/rejected": -0.176700621843338, "logps/chosen": -1.3545494079589844, "logps/rejected": -1.7904884815216064, "loss": 1.021, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3545494079589844, "rewards/margins": 0.43593907356262207, "rewards/rejected": -1.7904884815216064, "sft_loss": 1.394141674041748, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 7.904436255203085, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.17813178896903992, "logits/rejected": -0.05374855920672417, "logps/chosen": -1.2366796731948853, "logps/rejected": -1.7965199947357178, "loss": 0.9679, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2366796731948853, "rewards/margins": 0.5598403811454773, "rewards/rejected": -1.7965199947357178, "sft_loss": 1.3370568752288818, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 7.4402860611881465, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.1650870144367218, "logits/rejected": -0.2268679440021515, "logps/chosen": -1.3688818216323853, "logps/rejected": -1.739956259727478, "loss": 1.0155, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3688818216323853, "rewards/margins": 0.37107449769973755, "rewards/rejected": -1.739956259727478, "sft_loss": 1.3951529264450073, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 7.044974235801137, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.2574231028556824, "logits/rejected": -0.10612340271472931, "logps/chosen": -1.3511667251586914, "logps/rejected": -1.880702257156372, "loss": 0.9782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3511667251586914, "rewards/margins": 0.5295354127883911, "rewards/rejected": -1.880702257156372, "sft_loss": 1.377986192703247, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 10.285582997631959, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.170863077044487, "logits/rejected": -0.12588295340538025, "logps/chosen": -1.3761823177337646, "logps/rejected": -1.852365493774414, "loss": 1.0204, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3761823177337646, "rewards/margins": 0.4761830270290375, "rewards/rejected": -1.852365493774414, "sft_loss": 1.3957990407943726, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 6.546259057130183, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.1758599430322647, "logits/rejected": 0.03030111826956272, "logps/chosen": -1.297829031944275, "logps/rejected": -2.0442402362823486, "loss": 0.9007, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.297829031944275, "rewards/margins": 0.746411144733429, "rewards/rejected": -2.0442402362823486, "sft_loss": 1.3134304285049438, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 8.301983409544098, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.24078845977783203, "logits/rejected": -0.05983034521341324, "logps/chosen": -1.232013463973999, "logps/rejected": -1.936300277709961, "loss": 0.9475, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.232013463973999, "rewards/margins": 0.7042867541313171, "rewards/rejected": -1.936300277709961, "sft_loss": 1.2897391319274902, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 5.974846801942517, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.27302032709121704, "logits/rejected": -0.124132439494133, "logps/chosen": -1.2491251230239868, "logps/rejected": -1.7809514999389648, "loss": 0.9751, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2491251230239868, "rewards/margins": 0.5318263173103333, "rewards/rejected": -1.7809514999389648, "sft_loss": 1.3279565572738647, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 12.996647544401874, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.2610344886779785, "logits/rejected": -0.07429017871618271, "logps/chosen": -1.3629835844039917, "logps/rejected": -1.8292680978775024, "loss": 1.0442, "rewards/accuracies": 0.625, "rewards/chosen": -1.3629835844039917, "rewards/margins": 0.4662845730781555, "rewards/rejected": -1.8292680978775024, "sft_loss": 1.3952610492706299, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 10.682258574204342, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.2108164131641388, "logits/rejected": -0.09726519882678986, "logps/chosen": -1.3561142683029175, "logps/rejected": -1.9013593196868896, "loss": 0.9947, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3561142683029175, "rewards/margins": 0.5452449321746826, "rewards/rejected": -1.9013593196868896, "sft_loss": 1.3703248500823975, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 8.396879522182823, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.08900733292102814, "logits/rejected": 0.02392936311662197, "logps/chosen": -1.2975869178771973, "logps/rejected": -1.862375020980835, "loss": 0.9622, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2975869178771973, "rewards/margins": 0.5647882223129272, "rewards/rejected": -1.862375020980835, "sft_loss": 1.2736012935638428, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 7.986055069676312, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.15881235897541046, "logits/rejected": -0.06122690439224243, "logps/chosen": -1.3280928134918213, "logps/rejected": -1.8025190830230713, "loss": 0.9947, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3280928134918213, "rewards/margins": 0.47442618012428284, "rewards/rejected": -1.8025190830230713, "sft_loss": 1.3553526401519775, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 8.648469385118139, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.16575714945793152, "logits/rejected": -0.10236098617315292, "logps/chosen": -1.33333420753479, "logps/rejected": -1.7184255123138428, "loss": 1.0239, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.33333420753479, "rewards/margins": 0.3850913643836975, "rewards/rejected": -1.7184255123138428, "sft_loss": 1.3617249727249146, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 9.109556486343607, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.2730414867401123, "logits/rejected": -0.12491519749164581, "logps/chosen": -1.3478052616119385, "logps/rejected": -1.9820867776870728, "loss": 0.963, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3478052616119385, "rewards/margins": 0.6342812776565552, "rewards/rejected": -1.9820867776870728, "sft_loss": 1.3628085851669312, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 17.90572280189856, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.15778687596321106, "logits/rejected": -0.07748770713806152, "logps/chosen": -1.366068959236145, "logps/rejected": -1.822178840637207, "loss": 1.0242, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.366068959236145, "rewards/margins": 0.4561101496219635, "rewards/rejected": -1.822178840637207, "sft_loss": 1.3762224912643433, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 6.340516969551728, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.2035118043422699, "logits/rejected": -0.08557195216417313, "logps/chosen": -1.2682950496673584, "logps/rejected": -1.7354037761688232, "loss": 0.9789, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2682950496673584, "rewards/margins": 0.46710890531539917, "rewards/rejected": -1.7354037761688232, "sft_loss": 1.3428736925125122, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.09956033527851105, "eval_logits/rejected": 0.18866664171218872, "eval_logps/chosen": -1.3974061012268066, "eval_logps/rejected": -1.8380084037780762, "eval_loss": 1.0419869422912598, "eval_rewards/accuracies": 0.6142433285713196, "eval_rewards/chosen": -1.3974061012268066, "eval_rewards/margins": 0.4406021535396576, "eval_rewards/rejected": -1.8380084037780762, "eval_runtime": 48.2365, "eval_samples_per_second": 27.883, "eval_sft_loss": 1.4008758068084717, "eval_steps_per_second": 6.986, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 12.243034491589519, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.3126266300678253, "logits/rejected": -0.18945705890655518, "logps/chosen": -1.3755854368209839, "logps/rejected": -1.9291465282440186, "loss": 0.9907, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3755854368209839, "rewards/margins": 0.5535610914230347, "rewards/rejected": -1.9291465282440186, "sft_loss": 1.392539381980896, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 9.067815560279103, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.17166857421398163, "logits/rejected": -0.03303740173578262, "logps/chosen": -1.394575834274292, "logps/rejected": -1.859715223312378, "loss": 1.0549, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.394575834274292, "rewards/margins": 0.4651394784450531, "rewards/rejected": -1.859715223312378, "sft_loss": 1.4112951755523682, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 7.556783097027108, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.16894081234931946, "logits/rejected": -0.034742534160614014, "logps/chosen": -1.206220269203186, "logps/rejected": -1.8776988983154297, "loss": 0.927, "rewards/accuracies": 0.71875, "rewards/chosen": -1.206220269203186, "rewards/margins": 0.6714786291122437, "rewards/rejected": -1.8776988983154297, "sft_loss": 1.2821996212005615, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 8.883994735413456, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.14478550851345062, "logits/rejected": -0.06053388863801956, "logps/chosen": -1.3819754123687744, "logps/rejected": -1.8339745998382568, "loss": 1.0138, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3819754123687744, "rewards/margins": 0.4519991874694824, "rewards/rejected": -1.8339745998382568, "sft_loss": 1.3619210720062256, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 10.631060590250604, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.15467044711112976, "logits/rejected": -0.027900194749236107, "logps/chosen": -1.306641936302185, "logps/rejected": -1.7456728219985962, "loss": 0.9949, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.306641936302185, "rewards/margins": 0.43903082609176636, "rewards/rejected": -1.7456728219985962, "sft_loss": 1.302283525466919, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 6.9724462295350085, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.23680837452411652, "logits/rejected": -0.08103035390377045, "logps/chosen": -1.360740303993225, "logps/rejected": -1.769418716430664, "loss": 1.0285, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.360740303993225, "rewards/margins": 0.4086781442165375, "rewards/rejected": -1.769418716430664, "sft_loss": 1.3670880794525146, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 9.983272955806635, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.3214946389198303, "logits/rejected": -0.1256788820028305, "logps/chosen": -1.3193713426589966, "logps/rejected": -1.8835500478744507, "loss": 0.981, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3193713426589966, "rewards/margins": 0.5641787052154541, "rewards/rejected": -1.8835500478744507, "sft_loss": 1.346934199333191, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 8.038456616467183, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.27449679374694824, "logits/rejected": -0.07452504336833954, "logps/chosen": -1.3287158012390137, "logps/rejected": -1.82450270652771, "loss": 0.9938, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3287158012390137, "rewards/margins": 0.49578672647476196, "rewards/rejected": -1.82450270652771, "sft_loss": 1.376766324043274, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 9.078399651470578, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.243708997964859, "logits/rejected": -0.1337020993232727, "logps/chosen": -1.3256762027740479, "logps/rejected": -1.828753113746643, "loss": 1.0052, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3256762027740479, "rewards/margins": 0.5030766725540161, "rewards/rejected": -1.828753113746643, "sft_loss": 1.3922346830368042, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 14.233014918580938, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.3622373640537262, "logits/rejected": -0.19894781708717346, "logps/chosen": -1.3045145273208618, "logps/rejected": -1.907910943031311, "loss": 0.9678, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3045145273208618, "rewards/margins": 0.603396475315094, "rewards/rejected": -1.907910943031311, "sft_loss": 1.2965425252914429, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 7.813337647316198, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.17479762434959412, "logits/rejected": -0.0063132173381745815, "logps/chosen": -1.259197473526001, "logps/rejected": -1.8022098541259766, "loss": 1.0029, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.259197473526001, "rewards/margins": 0.5430123209953308, "rewards/rejected": -1.8022098541259766, "sft_loss": 1.326766014099121, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 12.79303668954908, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.23360638320446014, "logits/rejected": -0.16707371175289154, "logps/chosen": -1.2304644584655762, "logps/rejected": -1.8639189004898071, "loss": 0.9079, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2304644584655762, "rewards/margins": 0.6334545612335205, "rewards/rejected": -1.8639189004898071, "sft_loss": 1.2692384719848633, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 7.443452034913313, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.34079301357269287, "logits/rejected": -0.24996092915534973, "logps/chosen": -1.2894681692123413, "logps/rejected": -1.858069658279419, "loss": 0.9614, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2894681692123413, "rewards/margins": 0.5686012506484985, "rewards/rejected": -1.858069658279419, "sft_loss": 1.329217791557312, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 12.395401520728653, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.17850585281848907, "logits/rejected": -0.04492710903286934, "logps/chosen": -1.32509183883667, "logps/rejected": -1.837829828262329, "loss": 0.9917, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.32509183883667, "rewards/margins": 0.5127378702163696, "rewards/rejected": -1.837829828262329, "sft_loss": 1.3318425416946411, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 8.33277923667889, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.06016062572598457, "logits/rejected": 0.016124781221151352, "logps/chosen": -1.2886050939559937, "logps/rejected": -1.9926360845565796, "loss": 0.9573, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2886050939559937, "rewards/margins": 0.7040311694145203, "rewards/rejected": -1.9926360845565796, "sft_loss": 1.347915768623352, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 12.743055750152957, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.13470612466335297, "logits/rejected": -0.010785120539367199, "logps/chosen": -1.2654292583465576, "logps/rejected": -1.8969109058380127, "loss": 0.9741, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2654292583465576, "rewards/margins": 0.6314815282821655, "rewards/rejected": -1.8969109058380127, "sft_loss": 1.3268860578536987, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 9.596905530882646, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.14197297394275665, "logits/rejected": -0.06500792503356934, "logps/chosen": -1.4103431701660156, "logps/rejected": -1.9615188837051392, "loss": 1.0443, "rewards/accuracies": 0.625, "rewards/chosen": -1.4103431701660156, "rewards/margins": 0.5511755347251892, "rewards/rejected": -1.9615188837051392, "sft_loss": 1.5112073421478271, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 9.503153296695984, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.3041810095310211, "logits/rejected": -0.13739736378192902, "logps/chosen": -1.3125327825546265, "logps/rejected": -1.88010573387146, "loss": 0.9772, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3125327825546265, "rewards/margins": 0.5675729513168335, "rewards/rejected": -1.88010573387146, "sft_loss": 1.3756803274154663, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 7.885282658627051, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.2626744210720062, "logits/rejected": -0.17322595417499542, "logps/chosen": -1.3757003545761108, "logps/rejected": -1.753072738647461, "loss": 1.0796, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3757003545761108, "rewards/margins": 0.37737223505973816, "rewards/rejected": -1.753072738647461, "sft_loss": 1.458059549331665, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 6.666763867558161, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.15900710225105286, "logits/rejected": -0.023568501695990562, "logps/chosen": -1.2843879461288452, "logps/rejected": -1.8810021877288818, "loss": 0.9673, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2843879461288452, "rewards/margins": 0.5966143012046814, "rewards/rejected": -1.8810021877288818, "sft_loss": 1.3582007884979248, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 6.382501308102608, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.21288824081420898, "logits/rejected": -0.05509115010499954, "logps/chosen": -1.3490147590637207, "logps/rejected": -1.9092353582382202, "loss": 1.0173, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3490147590637207, "rewards/margins": 0.5602205395698547, "rewards/rejected": -1.9092353582382202, "sft_loss": 1.4373271465301514, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 8.782343504072141, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.23608019948005676, "logits/rejected": -0.2559540271759033, "logps/chosen": -1.3334243297576904, "logps/rejected": -2.139165163040161, "loss": 0.9426, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3334243297576904, "rewards/margins": 0.8057405352592468, "rewards/rejected": -2.139165163040161, "sft_loss": 1.3623217344284058, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 10.02999649415756, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.16636885702610016, "logits/rejected": -0.058062683790922165, "logps/chosen": -1.2314822673797607, "logps/rejected": -1.8077586889266968, "loss": 0.9453, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2314822673797607, "rewards/margins": 0.5762763023376465, "rewards/rejected": -1.8077586889266968, "sft_loss": 1.3330705165863037, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 18.166548359394767, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.14276938140392303, "logits/rejected": -0.05258003622293472, "logps/chosen": -1.3031578063964844, "logps/rejected": -1.8209069967269897, "loss": 1.0079, "rewards/accuracies": 0.625, "rewards/chosen": -1.3031578063964844, "rewards/margins": 0.5177491903305054, "rewards/rejected": -1.8209069967269897, "sft_loss": 1.3699431419372559, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 11.801562049253334, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.36980322003364563, "logits/rejected": -0.11403369903564453, "logps/chosen": -1.2860276699066162, "logps/rejected": -1.892530083656311, "loss": 0.9628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2860276699066162, "rewards/margins": 0.6065024137496948, "rewards/rejected": -1.892530083656311, "sft_loss": 1.355703592300415, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 9.188807326966096, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.17843613028526306, "logits/rejected": -0.025499451905488968, "logps/chosen": -1.4194364547729492, "logps/rejected": -2.0837531089782715, "loss": 0.9735, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4194364547729492, "rewards/margins": 0.6643165349960327, "rewards/rejected": -2.0837531089782715, "sft_loss": 1.3846460580825806, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 10.722029916958498, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.26976945996284485, "logits/rejected": -0.10888000577688217, "logps/chosen": -1.3152145147323608, "logps/rejected": -1.7785648107528687, "loss": 1.031, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3152145147323608, "rewards/margins": 0.46335023641586304, "rewards/rejected": -1.7785648107528687, "sft_loss": 1.3738157749176025, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 6.422919879862994, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.21269111335277557, "logits/rejected": -0.13011713325977325, "logps/chosen": -1.3373366594314575, "logps/rejected": -1.6350433826446533, "loss": 1.0387, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3373366594314575, "rewards/margins": 0.29770660400390625, "rewards/rejected": -1.6350433826446533, "sft_loss": 1.350295066833496, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 7.621540725374221, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.22732429206371307, "logits/rejected": -0.025684243068099022, "logps/chosen": -1.304892897605896, "logps/rejected": -2.0208868980407715, "loss": 0.9539, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.304892897605896, "rewards/margins": 0.7159940600395203, "rewards/rejected": -2.0208868980407715, "sft_loss": 1.3593806028366089, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 8.196773582663084, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.2208227664232254, "logits/rejected": -0.08531433343887329, "logps/chosen": -1.3079783916473389, "logps/rejected": -1.8024765253067017, "loss": 0.9937, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3079783916473389, "rewards/margins": 0.4944981038570404, "rewards/rejected": -1.8024765253067017, "sft_loss": 1.3162072896957397, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 9.310700403381741, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.22234082221984863, "logits/rejected": -0.22068548202514648, "logps/chosen": -1.3351229429244995, "logps/rejected": -1.8221311569213867, "loss": 1.0185, "rewards/accuracies": 0.625, "rewards/chosen": -1.3351229429244995, "rewards/margins": 0.48700833320617676, "rewards/rejected": -1.8221311569213867, "sft_loss": 1.3827526569366455, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 8.80574012157952, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.12468080222606659, "logits/rejected": -0.0179353766143322, "logps/chosen": -1.3131933212280273, "logps/rejected": -1.8085600137710571, "loss": 0.9783, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3131933212280273, "rewards/margins": 0.4953668713569641, "rewards/rejected": -1.8085600137710571, "sft_loss": 1.3201087713241577, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 7.285871873326477, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.2541118264198303, "logits/rejected": -0.12958697974681854, "logps/chosen": -1.2113314867019653, "logps/rejected": -1.7473266124725342, "loss": 0.9582, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2113314867019653, "rewards/margins": 0.5359951257705688, "rewards/rejected": -1.7473266124725342, "sft_loss": 1.306854486465454, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 14.073938284183036, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.13873368501663208, "logits/rejected": -0.06717085093259811, "logps/chosen": -1.2868283987045288, "logps/rejected": -1.8329023122787476, "loss": 0.9415, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2868283987045288, "rewards/margins": 0.5460739135742188, "rewards/rejected": -1.8329023122787476, "sft_loss": 1.2820491790771484, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 9.513931999414476, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.2162855565547943, "logits/rejected": -0.16296498477458954, "logps/chosen": -1.3681292533874512, "logps/rejected": -1.8026014566421509, "loss": 1.0238, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3681292533874512, "rewards/margins": 0.4344722628593445, "rewards/rejected": -1.8026014566421509, "sft_loss": 1.3663793802261353, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 8.955292464980037, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.11974634230136871, "logits/rejected": -0.04977312684059143, "logps/chosen": -1.4495785236358643, "logps/rejected": -1.9651418924331665, "loss": 1.0168, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4495785236358643, "rewards/margins": 0.5155635476112366, "rewards/rejected": -1.9651418924331665, "sft_loss": 1.3548001050949097, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 10.663220628394305, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.15686583518981934, "logits/rejected": 0.01727641560137272, "logps/chosen": -1.2853853702545166, "logps/rejected": -1.9318078756332397, "loss": 0.9685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2853853702545166, "rewards/margins": 0.6464226841926575, "rewards/rejected": -1.9318078756332397, "sft_loss": 1.2928669452667236, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 11.53115506279403, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.289564311504364, "logits/rejected": -0.07157912105321884, "logps/chosen": -1.2882611751556396, "logps/rejected": -1.8648605346679688, "loss": 0.9553, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2882611751556396, "rewards/margins": 0.5765994787216187, "rewards/rejected": -1.8648605346679688, "sft_loss": 1.3371655941009521, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 6.390690113174573, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.2623346745967865, "logits/rejected": -0.24493210017681122, "logps/chosen": -1.3538005352020264, "logps/rejected": -1.7726166248321533, "loss": 1.023, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3538005352020264, "rewards/margins": 0.418815940618515, "rewards/rejected": -1.7726166248321533, "sft_loss": 1.3910917043685913, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 6.85135820752156, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.22708475589752197, "logits/rejected": -0.20830416679382324, "logps/chosen": -1.3401455879211426, "logps/rejected": -1.891880750656128, "loss": 0.9958, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3401455879211426, "rewards/margins": 0.5517350435256958, "rewards/rejected": -1.891880750656128, "sft_loss": 1.3599205017089844, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 9.741872318805992, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.1690005362033844, "logits/rejected": -0.11986930668354034, "logps/chosen": -1.3109533786773682, "logps/rejected": -1.815737009048462, "loss": 0.9727, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3109533786773682, "rewards/margins": 0.5047835111618042, "rewards/rejected": -1.815737009048462, "sft_loss": 1.3245909214019775, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 8.275910236168725, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.14948007464408875, "logits/rejected": -0.049628086388111115, "logps/chosen": -1.4493799209594727, "logps/rejected": -1.9683153629302979, "loss": 1.0734, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4493799209594727, "rewards/margins": 0.5189353227615356, "rewards/rejected": -1.9683153629302979, "sft_loss": 1.4960986375808716, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 10.495542277964995, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.2823304831981659, "logits/rejected": -0.21650946140289307, "logps/chosen": -1.3710321187973022, "logps/rejected": -1.846612572669983, "loss": 0.9938, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3710321187973022, "rewards/margins": 0.47558069229125977, "rewards/rejected": -1.846612572669983, "sft_loss": 1.3259201049804688, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 13.036528693824037, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.2800753116607666, "logits/rejected": -0.08348731696605682, "logps/chosen": -1.2873401641845703, "logps/rejected": -1.763296127319336, "loss": 0.9832, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2873401641845703, "rewards/margins": 0.47595587372779846, "rewards/rejected": -1.763296127319336, "sft_loss": 1.3176186084747314, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 8.752888024463486, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.2731943428516388, "logits/rejected": -0.1365620493888855, "logps/chosen": -1.3452825546264648, "logps/rejected": -1.8296034336090088, "loss": 1.0007, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3452825546264648, "rewards/margins": 0.4843209385871887, "rewards/rejected": -1.8296034336090088, "sft_loss": 1.337990164756775, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 6.755409189563525, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.20813412964344025, "logits/rejected": -0.13779008388519287, "logps/chosen": -1.3276698589324951, "logps/rejected": -1.9187484979629517, "loss": 0.9926, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3276698589324951, "rewards/margins": 0.5910786390304565, "rewards/rejected": -1.9187484979629517, "sft_loss": 1.333012342453003, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 14.67877774359697, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.2622719407081604, "logits/rejected": -0.13872763514518738, "logps/chosen": -1.3953707218170166, "logps/rejected": -1.8387365341186523, "loss": 1.0281, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3953707218170166, "rewards/margins": 0.4433657228946686, "rewards/rejected": -1.8387365341186523, "sft_loss": 1.4280171394348145, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 10.452396185649386, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.014820876531302929, "logits/rejected": -0.058113228529691696, "logps/chosen": -1.3324902057647705, "logps/rejected": -1.8379656076431274, "loss": 0.9951, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3324902057647705, "rewards/margins": 0.5054755210876465, "rewards/rejected": -1.8379656076431274, "sft_loss": 1.342046856880188, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 10.706514654691368, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.1659650355577469, "logits/rejected": -0.13472406566143036, "logps/chosen": -1.2917453050613403, "logps/rejected": -1.7612578868865967, "loss": 0.9835, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2917453050613403, "rewards/margins": 0.4695127606391907, "rewards/rejected": -1.7612578868865967, "sft_loss": 1.319317102432251, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 7.839518418772648, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.22668242454528809, "logits/rejected": -0.16178184747695923, "logps/chosen": -1.185306429862976, "logps/rejected": -1.7473878860473633, "loss": 0.9152, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.185306429862976, "rewards/margins": 0.5620813369750977, "rewards/rejected": -1.7473878860473633, "sft_loss": 1.2352666854858398, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 10.126397618064189, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.322266548871994, "logits/rejected": -0.23900584876537323, "logps/chosen": -1.2218904495239258, "logps/rejected": -1.7490746974945068, "loss": 0.9703, "rewards/accuracies": 0.625, "rewards/chosen": -1.2218904495239258, "rewards/margins": 0.5271841883659363, "rewards/rejected": -1.7490746974945068, "sft_loss": 1.2784899473190308, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 15.316353952433701, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.1940038502216339, "logits/rejected": -0.12784381210803986, "logps/chosen": -1.3368351459503174, "logps/rejected": -1.7661672830581665, "loss": 1.0231, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3368351459503174, "rewards/margins": 0.42933225631713867, "rewards/rejected": -1.7661672830581665, "sft_loss": 1.4155434370040894, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 17.394926694549877, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.23021197319030762, "logits/rejected": 0.012189358472824097, "logps/chosen": -1.347095251083374, "logps/rejected": -1.8356088399887085, "loss": 0.9938, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.347095251083374, "rewards/margins": 0.4885135293006897, "rewards/rejected": -1.8356088399887085, "sft_loss": 1.3754655122756958, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 11.886876249226376, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.11982126533985138, "logits/rejected": 0.03776576370000839, "logps/chosen": -1.2209315299987793, "logps/rejected": -1.650206208229065, "loss": 0.9695, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2209315299987793, "rewards/margins": 0.4292744994163513, "rewards/rejected": -1.650206208229065, "sft_loss": 1.2713295221328735, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 15.403235946354835, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.18403539061546326, "logits/rejected": -0.1058233380317688, "logps/chosen": -1.2756471633911133, "logps/rejected": -1.5969749689102173, "loss": 1.0286, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2756471633911133, "rewards/margins": 0.32132771611213684, "rewards/rejected": -1.5969749689102173, "sft_loss": 1.3122966289520264, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 6.884410383974212, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.23184525966644287, "logits/rejected": -0.09298277646303177, "logps/chosen": -1.3421884775161743, "logps/rejected": -1.8177179098129272, "loss": 0.9939, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3421884775161743, "rewards/margins": 0.4755295217037201, "rewards/rejected": -1.8177179098129272, "sft_loss": 1.3477542400360107, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 17.033262401083658, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.2996065616607666, "logits/rejected": -0.1711694747209549, "logps/chosen": -1.2369778156280518, "logps/rejected": -1.918890357017517, "loss": 0.9122, "rewards/accuracies": 0.75, "rewards/chosen": -1.2369778156280518, "rewards/margins": 0.6819124817848206, "rewards/rejected": -1.918890357017517, "sft_loss": 1.2690715789794922, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 16.15761950655529, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.3268240690231323, "logits/rejected": -0.1671230047941208, "logps/chosen": -1.3656808137893677, "logps/rejected": -1.9123808145523071, "loss": 1.0189, "rewards/accuracies": 0.625, "rewards/chosen": -1.3656808137893677, "rewards/margins": 0.5466999411582947, "rewards/rejected": -1.9123808145523071, "sft_loss": 1.3982808589935303, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 7.783888684416137, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.3442414402961731, "logits/rejected": -0.1430184692144394, "logps/chosen": -1.286874532699585, "logps/rejected": -1.9325740337371826, "loss": 0.9666, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.286874532699585, "rewards/margins": 0.6456994414329529, "rewards/rejected": -1.9325740337371826, "sft_loss": 1.3336836099624634, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 10.148748409509505, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.14277896285057068, "logits/rejected": 0.0038436322938650846, "logps/chosen": -1.2554153203964233, "logps/rejected": -2.050020933151245, "loss": 0.9078, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2554153203964233, "rewards/margins": 0.7946058511734009, "rewards/rejected": -2.050020933151245, "sft_loss": 1.2769153118133545, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 12.320620324937778, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.20719130337238312, "logits/rejected": -0.04225752875208855, "logps/chosen": -1.3282561302185059, "logps/rejected": -1.8030502796173096, "loss": 0.9795, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3282561302185059, "rewards/margins": 0.4747942388057709, "rewards/rejected": -1.8030502796173096, "sft_loss": 1.3411647081375122, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 8.552167395329858, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.2896295487880707, "logits/rejected": -0.14427712559700012, "logps/chosen": -1.325205683708191, "logps/rejected": -1.812347173690796, "loss": 0.996, "rewards/accuracies": 0.625, "rewards/chosen": -1.325205683708191, "rewards/margins": 0.4871414303779602, "rewards/rejected": -1.812347173690796, "sft_loss": 1.332587718963623, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 8.47360607592217, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.3206334710121155, "logits/rejected": -0.10871531069278717, "logps/chosen": -1.2992509603500366, "logps/rejected": -1.9598051309585571, "loss": 0.9699, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2992509603500366, "rewards/margins": 0.6605542302131653, "rewards/rejected": -1.9598051309585571, "sft_loss": 1.3578704595565796, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 12.042241142188479, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.2356669157743454, "logits/rejected": -0.17199601233005524, "logps/chosen": -1.4252959489822388, "logps/rejected": -2.001298189163208, "loss": 0.9816, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4252959489822388, "rewards/margins": 0.5760020017623901, "rewards/rejected": -2.001298189163208, "sft_loss": 1.4144552946090698, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 27.851101467081993, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.34884732961654663, "logits/rejected": -0.12272150814533234, "logps/chosen": -1.2981888055801392, "logps/rejected": -1.770521879196167, "loss": 1.0054, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2981888055801392, "rewards/margins": 0.47233304381370544, "rewards/rejected": -1.770521879196167, "sft_loss": 1.3615667819976807, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 6.017299596428813, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.4026873707771301, "logits/rejected": -0.20588211715221405, "logps/chosen": -1.291322112083435, "logps/rejected": -1.9024931192398071, "loss": 0.9745, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.291322112083435, "rewards/margins": 0.6111709475517273, "rewards/rejected": -1.9024931192398071, "sft_loss": 1.3745701313018799, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 9.072085408916156, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.15757055580615997, "logits/rejected": -0.08821666985750198, "logps/chosen": -1.319150686264038, "logps/rejected": -1.785361647605896, "loss": 0.9952, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.319150686264038, "rewards/margins": 0.4662107825279236, "rewards/rejected": -1.785361647605896, "sft_loss": 1.3475697040557861, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 7.2791490733710225, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.16408401727676392, "logits/rejected": -0.02998713217675686, "logps/chosen": -1.3824737071990967, "logps/rejected": -1.8074893951416016, "loss": 1.0441, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3824737071990967, "rewards/margins": 0.42501574754714966, "rewards/rejected": -1.8074893951416016, "sft_loss": 1.4203590154647827, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 9.76020026843918, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.23930442333221436, "logits/rejected": -0.06129174679517746, "logps/chosen": -1.2756531238555908, "logps/rejected": -1.7888301610946655, "loss": 0.9928, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2756531238555908, "rewards/margins": 0.5131770968437195, "rewards/rejected": -1.7888301610946655, "sft_loss": 1.3264212608337402, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 7.203198297540759, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.30419832468032837, "logits/rejected": -0.18057909607887268, "logps/chosen": -1.2716641426086426, "logps/rejected": -1.7480897903442383, "loss": 0.9584, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2716641426086426, "rewards/margins": 0.4764255881309509, "rewards/rejected": -1.7480897903442383, "sft_loss": 1.31839120388031, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 7.7771625212828, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.29892662167549133, "logits/rejected": -0.1932951807975769, "logps/chosen": -1.3394827842712402, "logps/rejected": -1.8271324634552002, "loss": 0.9999, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3394827842712402, "rewards/margins": 0.4876495897769928, "rewards/rejected": -1.8271324634552002, "sft_loss": 1.4048898220062256, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 6.302749082938999, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.25966915488243103, "logits/rejected": -0.19929108023643494, "logps/chosen": -1.2809369564056396, "logps/rejected": -1.862352967262268, "loss": 0.9495, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2809369564056396, "rewards/margins": 0.5814159512519836, "rewards/rejected": -1.862352967262268, "sft_loss": 1.3612562417984009, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 9.166286039381573, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.23846633732318878, "logits/rejected": -0.04908045381307602, "logps/chosen": -1.2551108598709106, "logps/rejected": -1.8049733638763428, "loss": 0.9493, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2551108598709106, "rewards/margins": 0.5498624444007874, "rewards/rejected": -1.8049733638763428, "sft_loss": 1.3031388521194458, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 6.057023934672095, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.21795444190502167, "logits/rejected": -0.08611872047185898, "logps/chosen": -1.3664335012435913, "logps/rejected": -1.8410437107086182, "loss": 1.0505, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3664335012435913, "rewards/margins": 0.4746101498603821, "rewards/rejected": -1.8410437107086182, "sft_loss": 1.4286975860595703, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 12.377505883238616, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.26580289006233215, "logits/rejected": -0.15877142548561096, "logps/chosen": -1.3597075939178467, "logps/rejected": -1.8588300943374634, "loss": 1.0197, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3597075939178467, "rewards/margins": 0.499122679233551, "rewards/rejected": -1.8588300943374634, "sft_loss": 1.3698407411575317, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 10.005247094936804, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.16776493191719055, "logits/rejected": -0.11533866822719574, "logps/chosen": -1.3729407787322998, "logps/rejected": -1.8863117694854736, "loss": 1.0264, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3729407787322998, "rewards/margins": 0.5133708715438843, "rewards/rejected": -1.8863117694854736, "sft_loss": 1.424452781677246, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 8.47032457767384, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.19939279556274414, "logits/rejected": -0.08026957511901855, "logps/chosen": -1.2351583242416382, "logps/rejected": -1.9103492498397827, "loss": 0.9511, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2351583242416382, "rewards/margins": 0.675190806388855, "rewards/rejected": -1.9103492498397827, "sft_loss": 1.3065307140350342, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 10.78552905594794, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.3108990788459778, "logits/rejected": -0.14208605885505676, "logps/chosen": -1.284757137298584, "logps/rejected": -1.823754906654358, "loss": 0.9872, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.284757137298584, "rewards/margins": 0.5389977097511292, "rewards/rejected": -1.823754906654358, "sft_loss": 1.3961396217346191, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 11.737073842324726, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.21096226572990417, "logits/rejected": -0.0763673335313797, "logps/chosen": -1.3107333183288574, "logps/rejected": -1.7774174213409424, "loss": 0.983, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3107333183288574, "rewards/margins": 0.4666841924190521, "rewards/rejected": -1.7774174213409424, "sft_loss": 1.3360531330108643, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 11.707716272263704, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.27849727869033813, "logits/rejected": -0.12080128490924835, "logps/chosen": -1.269670844078064, "logps/rejected": -1.8024629354476929, "loss": 1.0003, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.269670844078064, "rewards/margins": 0.5327920913696289, "rewards/rejected": -1.8024629354476929, "sft_loss": 1.3441965579986572, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.11369408667087555, "eval_logits/rejected": 0.20460036396980286, "eval_logps/chosen": -1.398575782775879, "eval_logps/rejected": -1.8438172340393066, "eval_loss": 1.041297197341919, "eval_rewards/accuracies": 0.6186943650245667, "eval_rewards/chosen": -1.398575782775879, "eval_rewards/margins": 0.4452415108680725, "eval_rewards/rejected": -1.8438172340393066, "eval_runtime": 50.9806, "eval_samples_per_second": 26.383, "eval_sft_loss": 1.4026674032211304, "eval_steps_per_second": 6.61, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 7.79934160621283, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.16183073818683624, "logits/rejected": -0.1267414540052414, "logps/chosen": -1.3054921627044678, "logps/rejected": -1.7374639511108398, "loss": 1.042, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3054921627044678, "rewards/margins": 0.43197187781333923, "rewards/rejected": -1.7374639511108398, "sft_loss": 1.331822156906128, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 14.886700148689938, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.21894225478172302, "logits/rejected": -0.12526299059391022, "logps/chosen": -1.3163914680480957, "logps/rejected": -1.8971529006958008, "loss": 0.9634, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3163914680480957, "rewards/margins": 0.5807615518569946, "rewards/rejected": -1.8971529006958008, "sft_loss": 1.3194328546524048, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 5.547064945350949, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.23416312038898468, "logits/rejected": -0.13896510004997253, "logps/chosen": -1.2732197046279907, "logps/rejected": -1.8109039068222046, "loss": 0.974, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2732197046279907, "rewards/margins": 0.5376842021942139, "rewards/rejected": -1.8109039068222046, "sft_loss": 1.3302028179168701, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 11.434534387420785, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.2328489124774933, "logits/rejected": -0.15601322054862976, "logps/chosen": -1.2758675813674927, "logps/rejected": -1.9007642269134521, "loss": 0.991, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2758675813674927, "rewards/margins": 0.6248966455459595, "rewards/rejected": -1.9007642269134521, "sft_loss": 1.3788985013961792, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 28.682691199755357, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.16524925827980042, "logits/rejected": -0.023778708651661873, "logps/chosen": -1.376541018486023, "logps/rejected": -1.9343907833099365, "loss": 1.0012, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.376541018486023, "rewards/margins": 0.5578497052192688, "rewards/rejected": -1.9343907833099365, "sft_loss": 1.3966842889785767, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 10.160924466266978, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.3029932677745819, "logits/rejected": -0.1541920006275177, "logps/chosen": -1.3313233852386475, "logps/rejected": -1.7757488489151, "loss": 0.9956, "rewards/accuracies": 0.625, "rewards/chosen": -1.3313233852386475, "rewards/margins": 0.4444255828857422, "rewards/rejected": -1.7757488489151, "sft_loss": 1.331716775894165, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 7.942468487830706, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.2790893018245697, "logits/rejected": -0.16795600950717926, "logps/chosen": -1.3298932313919067, "logps/rejected": -1.826585054397583, "loss": 0.9702, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3298932313919067, "rewards/margins": 0.49669161438941956, "rewards/rejected": -1.826585054397583, "sft_loss": 1.3156096935272217, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 11.404925923136052, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.26472631096839905, "logits/rejected": -0.053306348621845245, "logps/chosen": -1.34926438331604, "logps/rejected": -1.8661677837371826, "loss": 1.0163, "rewards/accuracies": 0.59375, "rewards/chosen": -1.34926438331604, "rewards/margins": 0.5169033408164978, "rewards/rejected": -1.8661677837371826, "sft_loss": 1.3601570129394531, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 9.149071007317527, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.17115184664726257, "logits/rejected": -0.11456646770238876, "logps/chosen": -1.292288064956665, "logps/rejected": -1.7565841674804688, "loss": 1.0066, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.292288064956665, "rewards/margins": 0.46429625153541565, "rewards/rejected": -1.7565841674804688, "sft_loss": 1.303889274597168, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 14.640790871582416, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.2621290683746338, "logits/rejected": -0.1196102723479271, "logps/chosen": -1.3476142883300781, "logps/rejected": -1.876542091369629, "loss": 1.0228, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3476142883300781, "rewards/margins": 0.5289276838302612, "rewards/rejected": -1.876542091369629, "sft_loss": 1.3881253004074097, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 12.441149190766268, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.2256254404783249, "logits/rejected": -0.0006707877037115395, "logps/chosen": -1.341567873954773, "logps/rejected": -1.817436933517456, "loss": 1.0167, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.341567873954773, "rewards/margins": 0.4758690297603607, "rewards/rejected": -1.817436933517456, "sft_loss": 1.3392161130905151, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 16.34873624523049, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.19952070713043213, "logits/rejected": -0.12068512290716171, "logps/chosen": -1.3146910667419434, "logps/rejected": -1.8844077587127686, "loss": 1.0118, "rewards/accuracies": 0.625, "rewards/chosen": -1.3146910667419434, "rewards/margins": 0.5697168707847595, "rewards/rejected": -1.8844077587127686, "sft_loss": 1.3984930515289307, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 9.920009189503926, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.3461669683456421, "logits/rejected": -0.1222844123840332, "logps/chosen": -1.5000916719436646, "logps/rejected": -1.9649989604949951, "loss": 1.0746, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5000916719436646, "rewards/margins": 0.4649074673652649, "rewards/rejected": -1.9649989604949951, "sft_loss": 1.5144903659820557, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 7.557753564737677, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.21707001328468323, "logits/rejected": -0.0680532231926918, "logps/chosen": -1.266671061515808, "logps/rejected": -1.8083187341690063, "loss": 0.9414, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.266671061515808, "rewards/margins": 0.5416474938392639, "rewards/rejected": -1.8083187341690063, "sft_loss": 1.2934260368347168, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 7.800923952205403, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.19317008554935455, "logits/rejected": -0.03749823570251465, "logps/chosen": -1.3289610147476196, "logps/rejected": -1.878650426864624, "loss": 0.9596, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3289610147476196, "rewards/margins": 0.5496894717216492, "rewards/rejected": -1.878650426864624, "sft_loss": 1.3256250619888306, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 15.100479281827543, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.2256535291671753, "logits/rejected": -0.05598093196749687, "logps/chosen": -1.4376709461212158, "logps/rejected": -2.014469623565674, "loss": 1.0343, "rewards/accuracies": 0.625, "rewards/chosen": -1.4376709461212158, "rewards/margins": 0.5767990350723267, "rewards/rejected": -2.014469623565674, "sft_loss": 1.4365990161895752, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 14.542188718953193, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.2840477526187897, "logits/rejected": -0.18323290348052979, "logps/chosen": -1.2720750570297241, "logps/rejected": -1.9221827983856201, "loss": 0.9368, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2720750570297241, "rewards/margins": 0.6501076221466064, "rewards/rejected": -1.9221827983856201, "sft_loss": 1.2888672351837158, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 12.497966768346453, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.21206767857074738, "logits/rejected": -0.06609787046909332, "logps/chosen": -1.3235752582550049, "logps/rejected": -1.765629768371582, "loss": 1.0268, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3235752582550049, "rewards/margins": 0.44205451011657715, "rewards/rejected": -1.765629768371582, "sft_loss": 1.3885878324508667, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 4.858483991361914, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.293332576751709, "logits/rejected": -0.09560652077198029, "logps/chosen": -1.422760009765625, "logps/rejected": -1.9979000091552734, "loss": 1.0268, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.422760009765625, "rewards/margins": 0.5751398801803589, "rewards/rejected": -1.9979000091552734, "sft_loss": 1.4550334215164185, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 9.142542353654257, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.23723764717578888, "logits/rejected": -0.1911575049161911, "logps/chosen": -1.3420722484588623, "logps/rejected": -1.9288721084594727, "loss": 0.986, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3420722484588623, "rewards/margins": 0.5867999196052551, "rewards/rejected": -1.9288721084594727, "sft_loss": 1.3467390537261963, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 11.931970720363225, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.2269028127193451, "logits/rejected": -0.08750694990158081, "logps/chosen": -1.360556960105896, "logps/rejected": -1.8920097351074219, "loss": 1.0135, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.360556960105896, "rewards/margins": 0.5314527750015259, "rewards/rejected": -1.8920097351074219, "sft_loss": 1.4528709650039673, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 6.881015565821513, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.3738090693950653, "logits/rejected": -0.11932375282049179, "logps/chosen": -1.277724027633667, "logps/rejected": -1.9794114828109741, "loss": 0.9526, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.277724027633667, "rewards/margins": 0.701687216758728, "rewards/rejected": -1.9794114828109741, "sft_loss": 1.326658844947815, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 11.822782857418908, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.12437667697668076, "logits/rejected": -0.001963780727237463, "logps/chosen": -1.3134911060333252, "logps/rejected": -1.7377033233642578, "loss": 0.9931, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3134911060333252, "rewards/margins": 0.4242123067378998, "rewards/rejected": -1.7377033233642578, "sft_loss": 1.3535457849502563, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 7.61898243948295, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.1452607810497284, "logits/rejected": -0.06799864023923874, "logps/chosen": -1.2906428575515747, "logps/rejected": -1.701115369796753, "loss": 0.9981, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2906428575515747, "rewards/margins": 0.41047239303588867, "rewards/rejected": -1.701115369796753, "sft_loss": 1.2899892330169678, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 7.697647234000022, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.18606527149677277, "logits/rejected": -0.041612427681684494, "logps/chosen": -1.3773971796035767, "logps/rejected": -1.968849539756775, "loss": 0.9795, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3773971796035767, "rewards/margins": 0.5914527177810669, "rewards/rejected": -1.968849539756775, "sft_loss": 1.3904547691345215, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 9.375056323214285, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.14773902297019958, "logits/rejected": -0.07479329407215118, "logps/chosen": -1.2629927396774292, "logps/rejected": -1.8598219156265259, "loss": 0.9804, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2629927396774292, "rewards/margins": 0.5968291759490967, "rewards/rejected": -1.8598219156265259, "sft_loss": 1.350647211074829, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 15.012322470417251, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.11493344604969025, "logits/rejected": -0.13788627088069916, "logps/chosen": -1.3153281211853027, "logps/rejected": -1.6620845794677734, "loss": 1.0355, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3153281211853027, "rewards/margins": 0.3467564284801483, "rewards/rejected": -1.6620845794677734, "sft_loss": 1.3026233911514282, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 12.17512336395831, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.16038838028907776, "logits/rejected": -0.12360197305679321, "logps/chosen": -1.3261483907699585, "logps/rejected": -1.7351865768432617, "loss": 0.9976, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3261483907699585, "rewards/margins": 0.40903812646865845, "rewards/rejected": -1.7351865768432617, "sft_loss": 1.319050908088684, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 4.8655149717993105, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.3073890507221222, "logits/rejected": -0.12378803640604019, "logps/chosen": -1.2660466432571411, "logps/rejected": -1.792517900466919, "loss": 0.9352, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2660466432571411, "rewards/margins": 0.5264711380004883, "rewards/rejected": -1.792517900466919, "sft_loss": 1.3165446519851685, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 7.823717651654698, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.15731899440288544, "logits/rejected": -0.17895503342151642, "logps/chosen": -1.3960539102554321, "logps/rejected": -1.7920535802841187, "loss": 1.0472, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3960539102554321, "rewards/margins": 0.3959996700286865, "rewards/rejected": -1.7920535802841187, "sft_loss": 1.3741943836212158, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 10.189124237163066, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.23294535279273987, "logits/rejected": -0.14471940696239471, "logps/chosen": -1.3473031520843506, "logps/rejected": -1.8221263885498047, "loss": 1.0107, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3473031520843506, "rewards/margins": 0.4748232960700989, "rewards/rejected": -1.8221263885498047, "sft_loss": 1.3688666820526123, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 12.1538866560791, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.205631285905838, "logits/rejected": -0.008093352429568768, "logps/chosen": -1.3273183107376099, "logps/rejected": -1.844382643699646, "loss": 1.0101, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3273183107376099, "rewards/margins": 0.5170644521713257, "rewards/rejected": -1.844382643699646, "sft_loss": 1.3940155506134033, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 8.240418561246397, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.27251097559928894, "logits/rejected": -0.10316653549671173, "logps/chosen": -1.3439300060272217, "logps/rejected": -1.923710823059082, "loss": 0.9782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3439300060272217, "rewards/margins": 0.5797806978225708, "rewards/rejected": -1.923710823059082, "sft_loss": 1.3883768320083618, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 8.628519080580295, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.13890625536441803, "logits/rejected": -0.10441068559885025, "logps/chosen": -1.3346256017684937, "logps/rejected": -1.661921739578247, "loss": 1.0487, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3346256017684937, "rewards/margins": 0.3272959589958191, "rewards/rejected": -1.661921739578247, "sft_loss": 1.3400561809539795, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 7.675909185183875, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.19847624003887177, "logits/rejected": -0.10837669670581818, "logps/chosen": -1.29621160030365, "logps/rejected": -1.9675801992416382, "loss": 0.9917, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.29621160030365, "rewards/margins": 0.6713687181472778, "rewards/rejected": -1.9675801992416382, "sft_loss": 1.3709373474121094, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 8.00604306816368, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.25982865691185, "logits/rejected": -0.041426219046115875, "logps/chosen": -1.4465641975402832, "logps/rejected": -2.004077911376953, "loss": 1.0053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4465641975402832, "rewards/margins": 0.5575135350227356, "rewards/rejected": -2.004077911376953, "sft_loss": 1.4928638935089111, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 6.391211316990939, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.1506432145833969, "logits/rejected": -0.06374558061361313, "logps/chosen": -1.3080568313598633, "logps/rejected": -1.7327619791030884, "loss": 0.9912, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3080568313598633, "rewards/margins": 0.42470502853393555, "rewards/rejected": -1.7327619791030884, "sft_loss": 1.3371647596359253, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 8.583832219329038, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.19303080439567566, "logits/rejected": -0.07137370109558105, "logps/chosen": -1.3770928382873535, "logps/rejected": -1.7824920415878296, "loss": 1.0351, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3770928382873535, "rewards/margins": 0.4053993225097656, "rewards/rejected": -1.7824920415878296, "sft_loss": 1.4261677265167236, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 9.393850330779484, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.3502636253833771, "logits/rejected": -0.14578810334205627, "logps/chosen": -1.2524877786636353, "logps/rejected": -1.7917158603668213, "loss": 0.9224, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2524877786636353, "rewards/margins": 0.5392279028892517, "rewards/rejected": -1.7917158603668213, "sft_loss": 1.2977898120880127, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 8.113996362616188, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.25449585914611816, "logits/rejected": 0.0005387455457821488, "logps/chosen": -1.3301738500595093, "logps/rejected": -2.1094958782196045, "loss": 0.9356, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3301738500595093, "rewards/margins": 0.7793217897415161, "rewards/rejected": -2.1094958782196045, "sft_loss": 1.334867238998413, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 7.448239479874202, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.15206363797187805, "logits/rejected": -0.10554593801498413, "logps/chosen": -1.4665147066116333, "logps/rejected": -1.9461534023284912, "loss": 1.0438, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4665147066116333, "rewards/margins": 0.4796389043331146, "rewards/rejected": -1.9461534023284912, "sft_loss": 1.4484292268753052, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 5.693368841094515, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.14152638614177704, "logits/rejected": -0.011924123391509056, "logps/chosen": -1.3331220149993896, "logps/rejected": -2.045759677886963, "loss": 0.9854, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3331220149993896, "rewards/margins": 0.712637722492218, "rewards/rejected": -2.045759677886963, "sft_loss": 1.3652178049087524, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 6.693209813625698, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.25429773330688477, "logits/rejected": -0.08107346296310425, "logps/chosen": -1.3584120273590088, "logps/rejected": -1.7874362468719482, "loss": 1.023, "rewards/accuracies": 0.625, "rewards/chosen": -1.3584120273590088, "rewards/margins": 0.42902421951293945, "rewards/rejected": -1.7874362468719482, "sft_loss": 1.411090612411499, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 7.726299637751278, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.18577158451080322, "logits/rejected": -0.11131541430950165, "logps/chosen": -1.2643746137619019, "logps/rejected": -1.72647225856781, "loss": 0.9986, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2643746137619019, "rewards/margins": 0.46209773421287537, "rewards/rejected": -1.72647225856781, "sft_loss": 1.2958446741104126, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 10.809253240200837, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.1767970770597458, "logits/rejected": -0.03920573741197586, "logps/chosen": -1.3743648529052734, "logps/rejected": -1.9125168323516846, "loss": 0.9875, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3743648529052734, "rewards/margins": 0.5381518602371216, "rewards/rejected": -1.9125168323516846, "sft_loss": 1.3702685832977295, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 8.151264446447978, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.20741066336631775, "logits/rejected": -0.03023364581167698, "logps/chosen": -1.3413336277008057, "logps/rejected": -1.9218580722808838, "loss": 0.9577, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3413336277008057, "rewards/margins": 0.5805243849754333, "rewards/rejected": -1.9218580722808838, "sft_loss": 1.3622300624847412, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 9.717039548252071, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.30634820461273193, "logits/rejected": -0.1974448412656784, "logps/chosen": -1.2954027652740479, "logps/rejected": -1.873197317123413, "loss": 0.9574, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2954027652740479, "rewards/margins": 0.5777946710586548, "rewards/rejected": -1.873197317123413, "sft_loss": 1.3467910289764404, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 16.560899956258325, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.25282543897628784, "logits/rejected": -0.13164076209068298, "logps/chosen": -1.3265902996063232, "logps/rejected": -1.8097198009490967, "loss": 1.0067, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3265902996063232, "rewards/margins": 0.4831293523311615, "rewards/rejected": -1.8097198009490967, "sft_loss": 1.3786664009094238, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 5.990699359674066, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.011542147025465965, "logits/rejected": 0.005524858832359314, "logps/chosen": -1.3121052980422974, "logps/rejected": -2.1332852840423584, "loss": 0.9434, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3121052980422974, "rewards/margins": 0.8211800456047058, "rewards/rejected": -2.1332852840423584, "sft_loss": 1.3157904148101807, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 12.619599450749567, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.09076647460460663, "logits/rejected": 0.011687842197716236, "logps/chosen": -1.3570753335952759, "logps/rejected": -1.9621295928955078, "loss": 0.9804, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3570753335952759, "rewards/margins": 0.6050541996955872, "rewards/rejected": -1.9621295928955078, "sft_loss": 1.368789792060852, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 7.831340047201271, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.23645658791065216, "logits/rejected": -0.062863290309906, "logps/chosen": -1.3370959758758545, "logps/rejected": -1.778796911239624, "loss": 1.0223, "rewards/accuracies": 0.625, "rewards/chosen": -1.3370959758758545, "rewards/margins": 0.44170108437538147, "rewards/rejected": -1.778796911239624, "sft_loss": 1.366168737411499, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 10.40011225408446, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.22707609832286835, "logits/rejected": -0.1244509220123291, "logps/chosen": -1.340163230895996, "logps/rejected": -1.9151283502578735, "loss": 0.9828, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.340163230895996, "rewards/margins": 0.5749651193618774, "rewards/rejected": -1.9151283502578735, "sft_loss": 1.402766466140747, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 8.544508926159043, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.2974518835544586, "logits/rejected": -0.10885222256183624, "logps/chosen": -1.3845881223678589, "logps/rejected": -1.844294786453247, "loss": 1.0019, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3845881223678589, "rewards/margins": 0.45970669388771057, "rewards/rejected": -1.844294786453247, "sft_loss": 1.3685612678527832, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 10.556707029804059, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.15490223467350006, "logits/rejected": -0.06371744722127914, "logps/chosen": -1.2789368629455566, "logps/rejected": -1.7651761770248413, "loss": 0.9691, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2789368629455566, "rewards/margins": 0.4862392544746399, "rewards/rejected": -1.7651761770248413, "sft_loss": 1.267244577407837, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 16.86907330565408, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.14799180626869202, "logits/rejected": -0.1888667643070221, "logps/chosen": -1.2474968433380127, "logps/rejected": -1.7585350275039673, "loss": 0.9819, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2474968433380127, "rewards/margins": 0.5110381841659546, "rewards/rejected": -1.7585350275039673, "sft_loss": 1.3227674961090088, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 8.141074083564165, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.20287397503852844, "logits/rejected": -0.09254992008209229, "logps/chosen": -1.3348429203033447, "logps/rejected": -1.7784709930419922, "loss": 1.0147, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3348429203033447, "rewards/margins": 0.4436280131340027, "rewards/rejected": -1.7784709930419922, "sft_loss": 1.3954198360443115, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 9.844583759460882, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.1399911642074585, "logits/rejected": 0.027894001454114914, "logps/chosen": -1.2917319536209106, "logps/rejected": -1.8091261386871338, "loss": 0.9802, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2917319536209106, "rewards/margins": 0.5173942446708679, "rewards/rejected": -1.8091261386871338, "sft_loss": 1.3570094108581543, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 10.753233019071727, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.22496263682842255, "logits/rejected": -0.18973574042320251, "logps/chosen": -1.2890931367874146, "logps/rejected": -1.7965633869171143, "loss": 0.9967, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2890931367874146, "rewards/margins": 0.5074703693389893, "rewards/rejected": -1.7965633869171143, "sft_loss": 1.371532678604126, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 14.825230069609155, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.2755940854549408, "logits/rejected": -0.07098913192749023, "logps/chosen": -1.267250418663025, "logps/rejected": -1.843408226966858, "loss": 0.9524, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.267250418663025, "rewards/margins": 0.5761579275131226, "rewards/rejected": -1.843408226966858, "sft_loss": 1.3246123790740967, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 8.70591824815608, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.16660761833190918, "logits/rejected": -0.049916822463274, "logps/chosen": -1.386541485786438, "logps/rejected": -1.805859923362732, "loss": 1.0579, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.386541485786438, "rewards/margins": 0.41931843757629395, "rewards/rejected": -1.805859923362732, "sft_loss": 1.4034945964813232, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 10.87677449432048, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.12635710835456848, "logits/rejected": -0.11135026067495346, "logps/chosen": -1.346790075302124, "logps/rejected": -1.7952775955200195, "loss": 1.0127, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.346790075302124, "rewards/margins": 0.4484872817993164, "rewards/rejected": -1.7952775955200195, "sft_loss": 1.376283884048462, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 12.222939159055558, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.2253093421459198, "logits/rejected": -0.10633256286382675, "logps/chosen": -1.342167615890503, "logps/rejected": -2.1339049339294434, "loss": 0.9421, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.342167615890503, "rewards/margins": 0.79173743724823, "rewards/rejected": -2.1339049339294434, "sft_loss": 1.3634456396102905, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 38.57193275680594, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.2967481017112732, "logits/rejected": -0.15350167453289032, "logps/chosen": -1.3245993852615356, "logps/rejected": -1.9612815380096436, "loss": 1.0, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3245993852615356, "rewards/margins": 0.6366821527481079, "rewards/rejected": -1.9612815380096436, "sft_loss": 1.37442147731781, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 10.18404680838618, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.18095073103904724, "logits/rejected": -0.12094295024871826, "logps/chosen": -1.290932297706604, "logps/rejected": -1.7183071374893188, "loss": 1.01, "rewards/accuracies": 0.65625, "rewards/chosen": -1.290932297706604, "rewards/margins": 0.42737483978271484, "rewards/rejected": -1.7183071374893188, "sft_loss": 1.324407935142517, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 9.147815224087866, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.21934518218040466, "logits/rejected": -0.09977545589208603, "logps/chosen": -1.3488126993179321, "logps/rejected": -1.9975097179412842, "loss": 0.974, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3488126993179321, "rewards/margins": 0.6486971378326416, "rewards/rejected": -1.9975097179412842, "sft_loss": 1.367114782333374, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 10.109138141206513, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.27003321051597595, "logits/rejected": -0.04579021781682968, "logps/chosen": -1.4224035739898682, "logps/rejected": -1.8239049911499023, "loss": 1.0401, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4224035739898682, "rewards/margins": 0.40150150656700134, "rewards/rejected": -1.8239049911499023, "sft_loss": 1.4241381883621216, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 12.128324494377605, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.20196180045604706, "logits/rejected": -0.15698209404945374, "logps/chosen": -1.295867681503296, "logps/rejected": -1.7471435070037842, "loss": 1.0313, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.295867681503296, "rewards/margins": 0.45127612352371216, "rewards/rejected": -1.7471435070037842, "sft_loss": 1.3786756992340088, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 11.33136853984457, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.3168911635875702, "logits/rejected": -0.15934643149375916, "logps/chosen": -1.3818787336349487, "logps/rejected": -1.8020331859588623, "loss": 1.0248, "rewards/accuracies": 0.625, "rewards/chosen": -1.3818787336349487, "rewards/margins": 0.4201543927192688, "rewards/rejected": -1.8020331859588623, "sft_loss": 1.336635947227478, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 12.551932178201476, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.21031954884529114, "logits/rejected": -0.09710313379764557, "logps/chosen": -1.2512221336364746, "logps/rejected": -1.681840181350708, "loss": 0.9822, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2512221336364746, "rewards/margins": 0.4306180477142334, "rewards/rejected": -1.681840181350708, "sft_loss": 1.3028347492218018, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 6.543632163538888, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.12163994461297989, "logits/rejected": -0.00886031985282898, "logps/chosen": -1.377091646194458, "logps/rejected": -1.926664113998413, "loss": 1.0066, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.377091646194458, "rewards/margins": 0.5495725274085999, "rewards/rejected": -1.926664113998413, "sft_loss": 1.4350866079330444, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 11.995615591283471, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.22242912650108337, "logits/rejected": -0.008000977337360382, "logps/chosen": -1.4153683185577393, "logps/rejected": -1.983817458152771, "loss": 1.0265, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4153683185577393, "rewards/margins": 0.568449079990387, "rewards/rejected": -1.983817458152771, "sft_loss": 1.4314537048339844, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 13.761928785661214, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.14891259372234344, "logits/rejected": -0.1366450935602188, "logps/chosen": -1.354978322982788, "logps/rejected": -1.6971426010131836, "loss": 1.0707, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.354978322982788, "rewards/margins": 0.3421640992164612, "rewards/rejected": -1.6971426010131836, "sft_loss": 1.4287164211273193, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 9.61519890066985, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.2660152316093445, "logits/rejected": -0.21024027466773987, "logps/chosen": -1.3015344142913818, "logps/rejected": -1.741166353225708, "loss": 1.0001, "rewards/accuracies": 0.625, "rewards/chosen": -1.3015344142913818, "rewards/margins": 0.4396318793296814, "rewards/rejected": -1.741166353225708, "sft_loss": 1.3129467964172363, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 8.956481812838017, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.18123656511306763, "logits/rejected": -0.045481182634830475, "logps/chosen": -1.2856550216674805, "logps/rejected": -2.00970721244812, "loss": 0.95, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2856550216674805, "rewards/margins": 0.7240523099899292, "rewards/rejected": -2.00970721244812, "sft_loss": 1.3504688739776611, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 7.346514532996517, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.2186519205570221, "logits/rejected": -0.06879440695047379, "logps/chosen": -1.2897922992706299, "logps/rejected": -1.8777774572372437, "loss": 0.9716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2897922992706299, "rewards/margins": 0.5879851579666138, "rewards/rejected": -1.8777774572372437, "sft_loss": 1.3677794933319092, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 7.005036649911722, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.3067344129085541, "logits/rejected": -0.051812849938869476, "logps/chosen": -1.2479729652404785, "logps/rejected": -1.8473840951919556, "loss": 0.9648, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2479729652404785, "rewards/margins": 0.599411129951477, "rewards/rejected": -1.8473840951919556, "sft_loss": 1.3181512355804443, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 5.346289798835596, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.2633610963821411, "logits/rejected": -0.09879405796527863, "logps/chosen": -1.3504259586334229, "logps/rejected": -1.948297142982483, "loss": 0.9951, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3504259586334229, "rewards/margins": 0.5978710651397705, "rewards/rejected": -1.948297142982483, "sft_loss": 1.4504520893096924, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 6.752210598255841, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.18967227637767792, "logits/rejected": -0.1351352035999298, "logps/chosen": -1.3926022052764893, "logps/rejected": -1.875583291053772, "loss": 1.0273, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3926022052764893, "rewards/margins": 0.48298120498657227, "rewards/rejected": -1.875583291053772, "sft_loss": 1.3738386631011963, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 14.99272495049364, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.21403300762176514, "logits/rejected": -0.07583233714103699, "logps/chosen": -1.2375662326812744, "logps/rejected": -1.7883220911026, "loss": 0.9561, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2375662326812744, "rewards/margins": 0.5507559180259705, "rewards/rejected": -1.7883220911026, "sft_loss": 1.3005077838897705, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 9.373094728599023, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.20850276947021484, "logits/rejected": -0.10659182071685791, "logps/chosen": -1.3211060762405396, "logps/rejected": -2.0065088272094727, "loss": 0.9909, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3211060762405396, "rewards/margins": 0.6854029297828674, "rewards/rejected": -2.0065088272094727, "sft_loss": 1.4024105072021484, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.12686319649219513, "eval_logits/rejected": 0.21865931153297424, "eval_logps/chosen": -1.3990031480789185, "eval_logps/rejected": -1.843998670578003, "eval_loss": 1.0415899753570557, "eval_rewards/accuracies": 0.6157270073890686, "eval_rewards/chosen": -1.3990031480789185, "eval_rewards/margins": 0.44499555230140686, "eval_rewards/rejected": -1.843998670578003, "eval_runtime": 50.6079, "eval_samples_per_second": 26.577, "eval_sft_loss": 1.403135061264038, "eval_steps_per_second": 6.659, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 1.0402103091545567, "train_runtime": 33797.3752, "train_samples_per_second": 5.307, "train_steps_per_second": 0.166 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }