{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 6.1621459865713515, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.06070180982351303, "logits/rejected": 0.14738903939723969, "logps/chosen": -1.716059684753418, "logps/rejected": -1.8892710208892822, "loss": 1.0429, "rewards/accuracies": 0.5625, "rewards/chosen": -1.716059684753418, "rewards/margins": 0.1732112467288971, "rewards/rejected": -1.8892710208892822, "semantic_entropy": 0.6584457159042358, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 9.137033794779027, "learning_rate": 1.7825311942959e-08, "logits/chosen": -0.0036977827548980713, "logits/rejected": 0.11409668624401093, "logps/chosen": -1.8028045892715454, "logps/rejected": -1.8464124202728271, "loss": 1.1233, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8028045892715454, "rewards/margins": 0.0436079278588295, "rewards/rejected": -1.8464124202728271, "semantic_entropy": 0.6394152641296387, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 9.22389226014171, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.029309600591659546, "logits/rejected": 0.06751412898302078, "logps/chosen": -1.6355518102645874, "logps/rejected": -1.7657592296600342, "loss": 1.1344, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6355518102645874, "rewards/margins": 0.13020756840705872, "rewards/rejected": -1.7657592296600342, "semantic_entropy": 0.6930069923400879, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 6.704632465419751, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.03660174086689949, "logits/rejected": 0.049360670149326324, "logps/chosen": -1.724509596824646, "logps/rejected": -1.8065202236175537, "loss": 1.145, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.724509596824646, "rewards/margins": 0.08201076835393906, "rewards/rejected": -1.8065202236175537, "semantic_entropy": 0.6685421466827393, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 13.950567091423647, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.04136265441775322, "logits/rejected": 0.044629622250795364, "logps/chosen": -1.869329810142517, "logps/rejected": -1.7786051034927368, "loss": 1.2712, "rewards/accuracies": 0.375, "rewards/chosen": -1.869329810142517, "rewards/margins": -0.09072484076023102, "rewards/rejected": -1.7786051034927368, "semantic_entropy": 0.6433960795402527, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 7.520127719976578, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.07225209474563599, "logits/rejected": 0.020951146259903908, "logps/chosen": -1.9089466333389282, "logps/rejected": -1.832271933555603, "loss": 1.1721, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.9089466333389282, "rewards/margins": -0.07667465507984161, "rewards/rejected": -1.832271933555603, "semantic_entropy": 0.6176777482032776, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 8.288075347283838, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.05746116489171982, "logits/rejected": 0.10160557925701141, "logps/chosen": -1.845741629600525, "logps/rejected": -1.9970605373382568, "loss": 1.1629, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.845741629600525, "rewards/margins": 0.1513189673423767, "rewards/rejected": -1.9970605373382568, "semantic_entropy": 0.6350187063217163, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 7.5458186716671465, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.05770735815167427, "logits/rejected": 0.23583391308784485, "logps/chosen": -1.880816102027893, "logps/rejected": -1.743043303489685, "loss": 1.2132, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.880816102027893, "rewards/margins": -0.1377728283405304, "rewards/rejected": -1.743043303489685, "semantic_entropy": 0.6431102752685547, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 12.928036650171752, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.049303993582725525, "logits/rejected": 0.25262051820755005, "logps/chosen": -1.837459921836853, "logps/rejected": -1.8713966608047485, "loss": 1.1798, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.837459921836853, "rewards/margins": 0.03393695876002312, "rewards/rejected": -1.8713966608047485, "semantic_entropy": 0.649166464805603, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 10.160669036683966, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.027670959010720253, "logits/rejected": 0.1239209994673729, "logps/chosen": -1.8993823528289795, "logps/rejected": -1.7789846658706665, "loss": 1.2256, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.8993823528289795, "rewards/margins": -0.1203979030251503, "rewards/rejected": -1.7789846658706665, "semantic_entropy": 0.6335883140563965, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 7.047012193835533, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.10063391923904419, "logits/rejected": 0.12058229744434357, "logps/chosen": -1.8336282968521118, "logps/rejected": -1.8673959970474243, "loss": 1.1935, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.8336282968521118, "rewards/margins": 0.03376791998744011, "rewards/rejected": -1.8673959970474243, "semantic_entropy": 0.6438094973564148, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 7.199053435790905, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.08423934876918793, "logits/rejected": 0.10448728501796722, "logps/chosen": -1.789345145225525, "logps/rejected": -1.894176721572876, "loss": 1.1008, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.789345145225525, "rewards/margins": 0.10483156144618988, "rewards/rejected": -1.894176721572876, "semantic_entropy": 0.6360429525375366, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 5.878839162191842, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.04232923686504364, "logits/rejected": 0.10366680473089218, "logps/chosen": -1.6381199359893799, "logps/rejected": -1.7684608697891235, "loss": 1.0888, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6381199359893799, "rewards/margins": 0.13034099340438843, "rewards/rejected": -1.7684608697891235, "semantic_entropy": 0.6962206959724426, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 11.097796193507412, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.07627397030591965, "logits/rejected": 0.07312844693660736, "logps/chosen": -1.766296148300171, "logps/rejected": -1.8135309219360352, "loss": 1.1905, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.766296148300171, "rewards/margins": 0.047234609723091125, "rewards/rejected": -1.8135309219360352, "semantic_entropy": 0.6539437770843506, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 11.180823699806128, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.043935492634773254, "logits/rejected": 0.1390921175479889, "logps/chosen": -1.7772403955459595, "logps/rejected": -2.038160562515259, "loss": 1.0594, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7772403955459595, "rewards/margins": 0.2609199583530426, "rewards/rejected": -2.038160562515259, "semantic_entropy": 0.6338866353034973, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 7.729614400854603, "learning_rate": 1.42602495543672e-07, "logits/chosen": 0.009521784260869026, "logits/rejected": 0.11359156668186188, "logps/chosen": -1.7183939218521118, "logps/rejected": -1.7508172988891602, "loss": 1.1522, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7183939218521118, "rewards/margins": 0.0324234738945961, "rewards/rejected": -1.7508172988891602, "semantic_entropy": 0.6691663265228271, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 5.774164895526498, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.16618943214416504, "logits/rejected": 0.07412171363830566, "logps/chosen": -1.7912899255752563, "logps/rejected": -1.9684991836547852, "loss": 1.1099, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7912899255752563, "rewards/margins": 0.17720915377140045, "rewards/rejected": -1.9684991836547852, "semantic_entropy": 0.6479779481887817, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 13.994171190985876, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.08484308421611786, "logits/rejected": 0.04691457375884056, "logps/chosen": -1.750454306602478, "logps/rejected": -1.7775003910064697, "loss": 1.1925, "rewards/accuracies": 0.46875, "rewards/chosen": -1.750454306602478, "rewards/margins": 0.027046024799346924, "rewards/rejected": -1.7775003910064697, "semantic_entropy": 0.668484091758728, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 5.179734454416302, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.0784115418791771, "logits/rejected": 0.06837181746959686, "logps/chosen": -1.805314302444458, "logps/rejected": -1.9120498895645142, "loss": 1.1394, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.805314302444458, "rewards/margins": 0.10673556476831436, "rewards/rejected": -1.9120498895645142, "semantic_entropy": 0.6409928202629089, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 6.303816361495141, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.0438729003071785, "logits/rejected": 0.01818550005555153, "logps/chosen": -1.6925382614135742, "logps/rejected": -1.8010832071304321, "loss": 1.104, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6925382614135742, "rewards/margins": 0.10854510962963104, "rewards/rejected": -1.8010832071304321, "semantic_entropy": 0.6733208298683167, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 8.250362407815302, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.04747066646814346, "logits/rejected": 0.07233314961194992, "logps/chosen": -1.6426517963409424, "logps/rejected": -1.8100935220718384, "loss": 1.0833, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6426517963409424, "rewards/margins": 0.16744166612625122, "rewards/rejected": -1.8100935220718384, "semantic_entropy": 0.6844531297683716, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 6.750093995633937, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.0031594126485288143, "logits/rejected": 0.09811054170131683, "logps/chosen": -1.6814390420913696, "logps/rejected": -1.7384357452392578, "loss": 1.1586, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6814390420913696, "rewards/margins": 0.05699686333537102, "rewards/rejected": -1.7384357452392578, "semantic_entropy": 0.6790895462036133, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 8.991359972391313, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.024249624460935593, "logits/rejected": 0.23187024891376495, "logps/chosen": -1.6709773540496826, "logps/rejected": -1.9569326639175415, "loss": 1.0366, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6709773540496826, "rewards/margins": 0.28595516085624695, "rewards/rejected": -1.9569326639175415, "semantic_entropy": 0.6562684178352356, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 5.808107193049869, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.07225940376520157, "logits/rejected": 0.10119612514972687, "logps/chosen": -1.7596435546875, "logps/rejected": -1.8809823989868164, "loss": 1.0991, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7596435546875, "rewards/margins": 0.12133894115686417, "rewards/rejected": -1.8809823989868164, "semantic_entropy": 0.6526800990104675, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 6.833917796547652, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.07721801102161407, "logits/rejected": 0.05250721424818039, "logps/chosen": -1.6813856363296509, "logps/rejected": -1.6302525997161865, "loss": 1.1922, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6813856363296509, "rewards/margins": -0.051132846623659134, "rewards/rejected": -1.6302525997161865, "semantic_entropy": 0.6917638778686523, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 8.414614080726308, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.02672005072236061, "logits/rejected": 0.15939494967460632, "logps/chosen": -1.7269341945648193, "logps/rejected": -1.847845435142517, "loss": 1.0565, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7269341945648193, "rewards/margins": 0.12091119587421417, "rewards/rejected": -1.847845435142517, "semantic_entropy": 0.6518223881721497, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 15.502747011489308, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.045921992510557175, "logits/rejected": 0.06921950727701187, "logps/chosen": -1.7797927856445312, "logps/rejected": -1.7936254739761353, "loss": 1.1683, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7797927856445312, "rewards/margins": 0.013832822442054749, "rewards/rejected": -1.7936254739761353, "semantic_entropy": 0.6467072367668152, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 10.934153827167037, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.029972439631819725, "logits/rejected": 0.13952571153640747, "logps/chosen": -1.7365680932998657, "logps/rejected": -1.8929036855697632, "loss": 1.0664, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7365680932998657, "rewards/margins": 0.15633563697338104, "rewards/rejected": -1.8929036855697632, "semantic_entropy": 0.6409581899642944, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 9.72307827733719, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.0310811810195446, "logits/rejected": 0.11993386596441269, "logps/chosen": -1.651533842086792, "logps/rejected": -1.7689011096954346, "loss": 1.0902, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.651533842086792, "rewards/margins": 0.1173669844865799, "rewards/rejected": -1.7689011096954346, "semantic_entropy": 0.6721662282943726, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 10.946409292553673, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.06234356015920639, "logits/rejected": 0.0975189134478569, "logps/chosen": -1.6039183139801025, "logps/rejected": -1.6026118993759155, "loss": 1.1726, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6039183139801025, "rewards/margins": -0.0013063341611996293, "rewards/rejected": -1.6026118993759155, "semantic_entropy": 0.7188035249710083, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 8.966979965135215, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.053622614592313766, "logits/rejected": -0.006728078238666058, "logps/chosen": -1.6239417791366577, "logps/rejected": -1.7132408618927002, "loss": 1.1056, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6239417791366577, "rewards/margins": 0.0892990455031395, "rewards/rejected": -1.7132408618927002, "semantic_entropy": 0.6902952790260315, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 7.433737051457635, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.1430046111345291, "logits/rejected": -0.0034906647633761168, "logps/chosen": -1.7533985376358032, "logps/rejected": -1.7312949895858765, "loss": 1.1909, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.7533985376358032, "rewards/margins": -0.022103413939476013, "rewards/rejected": -1.7312949895858765, "semantic_entropy": 0.6651071310043335, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 8.024498031822322, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.05941913276910782, "logits/rejected": 0.11122976243495941, "logps/chosen": -1.5744872093200684, "logps/rejected": -1.7276694774627686, "loss": 1.1042, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5744872093200684, "rewards/margins": 0.1531822681427002, "rewards/rejected": -1.7276694774627686, "semantic_entropy": 0.6984173059463501, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 12.469922507489153, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.0928221344947815, "logits/rejected": -0.041338033974170685, "logps/chosen": -1.7328227758407593, "logps/rejected": -1.7776778936386108, "loss": 1.1493, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.7328227758407593, "rewards/margins": 0.04485485702753067, "rewards/rejected": -1.7776778936386108, "semantic_entropy": 0.6588774919509888, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 10.037232387665194, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.05204933136701584, "logits/rejected": 0.04736893251538277, "logps/chosen": -1.6078169345855713, "logps/rejected": -1.7068147659301758, "loss": 1.135, "rewards/accuracies": 0.5, "rewards/chosen": -1.6078169345855713, "rewards/margins": 0.09899773448705673, "rewards/rejected": -1.7068147659301758, "semantic_entropy": 0.6981975436210632, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 8.220031140397653, "learning_rate": 3.2085561497326203e-07, "logits/chosen": 0.015208420343697071, "logits/rejected": 0.01307359803467989, "logps/chosen": -1.633329153060913, "logps/rejected": -1.777130365371704, "loss": 1.1206, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.633329153060913, "rewards/margins": 0.14380115270614624, "rewards/rejected": -1.777130365371704, "semantic_entropy": 0.6907540559768677, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 8.44277458445029, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.12780144810676575, "logits/rejected": -0.040175847709178925, "logps/chosen": -1.5936999320983887, "logps/rejected": -1.6630268096923828, "loss": 1.1614, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.5936999320983887, "rewards/margins": 0.06932689249515533, "rewards/rejected": -1.6630268096923828, "semantic_entropy": 0.7057312726974487, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 8.214598410169415, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.06513194739818573, "logits/rejected": 0.05364646762609482, "logps/chosen": -1.6519644260406494, "logps/rejected": -1.7357133626937866, "loss": 1.0944, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.6519644260406494, "rewards/margins": 0.08374904841184616, "rewards/rejected": -1.7357133626937866, "semantic_entropy": 0.6681785583496094, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 5.81056993834838, "learning_rate": 3.475935828877005e-07, "logits/chosen": 0.021021168678998947, "logits/rejected": 0.17753520607948303, "logps/chosen": -1.4594143629074097, "logps/rejected": -1.6115144491195679, "loss": 1.0913, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4594143629074097, "rewards/margins": 0.15210004150867462, "rewards/rejected": -1.6115144491195679, "semantic_entropy": 0.7464505434036255, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 12.873598411605691, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.06861015409231186, "logits/rejected": 0.07104112207889557, "logps/chosen": -1.5893186330795288, "logps/rejected": -1.601284384727478, "loss": 1.1435, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5893186330795288, "rewards/margins": 0.011965674348175526, "rewards/rejected": -1.601284384727478, "semantic_entropy": 0.7116156816482544, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 13.584024992116543, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.05508657544851303, "logits/rejected": 0.08913681656122208, "logps/chosen": -1.518781065940857, "logps/rejected": -1.5591602325439453, "loss": 1.1321, "rewards/accuracies": 0.5, "rewards/chosen": -1.518781065940857, "rewards/margins": 0.04037924110889435, "rewards/rejected": -1.5591602325439453, "semantic_entropy": 0.7272243499755859, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 12.81406377653034, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.135735422372818, "logits/rejected": 0.05884036421775818, "logps/chosen": -1.5534254312515259, "logps/rejected": -1.7356328964233398, "loss": 1.0671, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5534254312515259, "rewards/margins": 0.1822076290845871, "rewards/rejected": -1.7356328964233398, "semantic_entropy": 0.7095759510993958, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 7.357321246011275, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.18377165496349335, "logits/rejected": 0.060975439846515656, "logps/chosen": -1.5331767797470093, "logps/rejected": -1.627393126487732, "loss": 1.0754, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5331767797470093, "rewards/margins": 0.09421636164188385, "rewards/rejected": -1.627393126487732, "semantic_entropy": 0.7281553149223328, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 16.502921098288432, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.04538556560873985, "logits/rejected": 0.14347299933433533, "logps/chosen": -1.5174219608306885, "logps/rejected": -1.7298427820205688, "loss": 1.0482, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5174219608306885, "rewards/margins": 0.21242070198059082, "rewards/rejected": -1.7298427820205688, "semantic_entropy": 0.7120253443717957, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 6.370433761880968, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.0893501490354538, "logits/rejected": 0.08341099321842194, "logps/chosen": -1.4963688850402832, "logps/rejected": -1.64535391330719, "loss": 1.0625, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4963688850402832, "rewards/margins": 0.14898499846458435, "rewards/rejected": -1.64535391330719, "semantic_entropy": 0.7224361300468445, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 6.037012104027165, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.012658950872719288, "logits/rejected": 0.06327076256275177, "logps/chosen": -1.5707639455795288, "logps/rejected": -1.736476182937622, "loss": 1.0823, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.5707639455795288, "rewards/margins": 0.16571208834648132, "rewards/rejected": -1.736476182937622, "semantic_entropy": 0.7066926956176758, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 10.864711575015095, "learning_rate": 4.188948306595365e-07, "logits/chosen": 0.02206835150718689, "logits/rejected": 0.1628737896680832, "logps/chosen": -1.5150396823883057, "logps/rejected": -1.692229986190796, "loss": 1.0396, "rewards/accuracies": 0.625, "rewards/chosen": -1.5150396823883057, "rewards/margins": 0.17719021439552307, "rewards/rejected": -1.692229986190796, "semantic_entropy": 0.7193215489387512, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 7.074774903340499, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.02384335733950138, "logits/rejected": 0.1055992841720581, "logps/chosen": -1.5265928506851196, "logps/rejected": -1.719167947769165, "loss": 1.0547, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.5265928506851196, "rewards/margins": 0.19257517158985138, "rewards/rejected": -1.719167947769165, "semantic_entropy": 0.7045563459396362, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 7.662272971348944, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.036410488188266754, "logits/rejected": 0.1562315970659256, "logps/chosen": -1.6148598194122314, "logps/rejected": -1.7143230438232422, "loss": 1.0858, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6148598194122314, "rewards/margins": 0.09946312010288239, "rewards/rejected": -1.7143230438232422, "semantic_entropy": 0.6894456744194031, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 11.864125002258264, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.048241887241601944, "logits/rejected": 0.11782636493444443, "logps/chosen": -1.6349153518676758, "logps/rejected": -1.6976579427719116, "loss": 1.1406, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6349153518676758, "rewards/margins": 0.06274263560771942, "rewards/rejected": -1.6976579427719116, "semantic_entropy": 0.6956798434257507, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 7.796195965000263, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.020806463435292244, "logits/rejected": 0.12357542663812637, "logps/chosen": -1.4866034984588623, "logps/rejected": -1.6658039093017578, "loss": 1.056, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4866034984588623, "rewards/margins": 0.17920050024986267, "rewards/rejected": -1.6658039093017578, "semantic_entropy": 0.7214481830596924, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 7.563331790101729, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.21241986751556396, "logits/rejected": -0.10534496605396271, "logps/chosen": -1.653738260269165, "logps/rejected": -1.7724952697753906, "loss": 1.0457, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.653738260269165, "rewards/margins": 0.1187569871544838, "rewards/rejected": -1.7724952697753906, "semantic_entropy": 0.6744239330291748, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 16.160498491276236, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.07287711650133133, "logits/rejected": 0.009031775407493114, "logps/chosen": -1.675157904624939, "logps/rejected": -1.7961061000823975, "loss": 1.0988, "rewards/accuracies": 0.53125, "rewards/chosen": -1.675157904624939, "rewards/margins": 0.12094844877719879, "rewards/rejected": -1.7961061000823975, "semantic_entropy": 0.6546419262886047, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 5.654659760480054, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.08118149638175964, "logits/rejected": 0.0527944378554821, "logps/chosen": -1.5596070289611816, "logps/rejected": -1.6867377758026123, "loss": 1.0573, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5596070289611816, "rewards/margins": 0.12713071703910828, "rewards/rejected": -1.6867377758026123, "semantic_entropy": 0.6952215433120728, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 7.923835022429222, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.02389593794941902, "logits/rejected": 0.07080022990703583, "logps/chosen": -1.520407795906067, "logps/rejected": -1.754201889038086, "loss": 1.0602, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.520407795906067, "rewards/margins": 0.23379412293434143, "rewards/rejected": -1.754201889038086, "semantic_entropy": 0.7215244770050049, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 14.712512665656517, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.10531296581029892, "logits/rejected": 0.0524088516831398, "logps/chosen": -1.6230707168579102, "logps/rejected": -1.7438873052597046, "loss": 1.0823, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6230707168579102, "rewards/margins": 0.12081663310527802, "rewards/rejected": -1.7438873052597046, "semantic_entropy": 0.679011344909668, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 6.69243253514015, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.0718010812997818, "logits/rejected": 0.06791789084672928, "logps/chosen": -1.597745656967163, "logps/rejected": -1.7144542932510376, "loss": 1.106, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.597745656967163, "rewards/margins": 0.11670851707458496, "rewards/rejected": -1.7144542932510376, "semantic_entropy": 0.6957116723060608, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 7.7782211257781055, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.11950629949569702, "logits/rejected": 0.17797723412513733, "logps/chosen": -1.5724703073501587, "logps/rejected": -1.766248345375061, "loss": 1.0197, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5724703073501587, "rewards/margins": 0.1937781125307083, "rewards/rejected": -1.766248345375061, "semantic_entropy": 0.6906196475028992, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 11.79027110371697, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.04648825526237488, "logits/rejected": 0.012335294857621193, "logps/chosen": -1.5410258769989014, "logps/rejected": -1.663569688796997, "loss": 1.063, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5410258769989014, "rewards/margins": 0.12254378944635391, "rewards/rejected": -1.663569688796997, "semantic_entropy": 0.7017509341239929, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 8.835849573199795, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.07654620707035065, "logits/rejected": 0.09046686440706253, "logps/chosen": -1.5949294567108154, "logps/rejected": -1.71002197265625, "loss": 1.0829, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5949294567108154, "rewards/margins": 0.11509259045124054, "rewards/rejected": -1.71002197265625, "semantic_entropy": 0.6955488920211792, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 6.983366890384154, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.02877117693424225, "logits/rejected": 0.04091879725456238, "logps/chosen": -1.6932131052017212, "logps/rejected": -1.6878843307495117, "loss": 1.1315, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.6932131052017212, "rewards/margins": -0.005328828003257513, "rewards/rejected": -1.6878843307495117, "semantic_entropy": 0.6642959713935852, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 9.615112381569704, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.2014521062374115, "logits/rejected": -0.11008793115615845, "logps/chosen": -1.6740272045135498, "logps/rejected": -1.7929702997207642, "loss": 1.0864, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.6740272045135498, "rewards/margins": 0.11894307285547256, "rewards/rejected": -1.7929702997207642, "semantic_entropy": 0.6629990339279175, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 12.292656026710143, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.01841166988015175, "logits/rejected": 0.14304211735725403, "logps/chosen": -1.6711599826812744, "logps/rejected": -1.8580175638198853, "loss": 1.0557, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6711599826812744, "rewards/margins": 0.186857670545578, "rewards/rejected": -1.8580175638198853, "semantic_entropy": 0.6592516899108887, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 6.646670707784385, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.059432487934827805, "logits/rejected": 0.0704459697008133, "logps/chosen": -1.5968337059020996, "logps/rejected": -1.6530053615570068, "loss": 1.0909, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.5968337059020996, "rewards/margins": 0.05617170408368111, "rewards/rejected": -1.6530053615570068, "semantic_entropy": 0.6955806612968445, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 9.457657582105599, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.10694797337055206, "logits/rejected": 0.012209171429276466, "logps/chosen": -1.6328935623168945, "logps/rejected": -1.9439910650253296, "loss": 1.0218, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6328935623168945, "rewards/margins": 0.3110976219177246, "rewards/rejected": -1.9439910650253296, "semantic_entropy": 0.6683646440505981, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 12.926849465905892, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.005584185477346182, "logits/rejected": 0.14117801189422607, "logps/chosen": -1.6041080951690674, "logps/rejected": -1.904754638671875, "loss": 0.995, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6041080951690674, "rewards/margins": 0.30064669251441956, "rewards/rejected": -1.904754638671875, "semantic_entropy": 0.6687676906585693, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 20.510474921917158, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.04617486149072647, "logits/rejected": 0.151122584939003, "logps/chosen": -1.688357949256897, "logps/rejected": -1.7622886896133423, "loss": 1.0828, "rewards/accuracies": 0.5, "rewards/chosen": -1.688357949256897, "rewards/margins": 0.07393099367618561, "rewards/rejected": -1.7622886896133423, "semantic_entropy": 0.66343754529953, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 17.51474938670172, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.02019418589770794, "logits/rejected": 0.12733808159828186, "logps/chosen": -1.763425588607788, "logps/rejected": -1.875178575515747, "loss": 1.1057, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.763425588607788, "rewards/margins": 0.11175310611724854, "rewards/rejected": -1.875178575515747, "semantic_entropy": 0.6318264007568359, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 11.458477109078752, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.050300829112529755, "logits/rejected": 0.08116074651479721, "logps/chosen": -1.6531829833984375, "logps/rejected": -1.8452335596084595, "loss": 1.0593, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6531829833984375, "rewards/margins": 0.1920507401227951, "rewards/rejected": -1.8452335596084595, "semantic_entropy": 0.6621311902999878, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 11.329624892763588, "learning_rate": 6.238859180035651e-07, "logits/chosen": 0.015283575281500816, "logits/rejected": 0.1078251451253891, "logps/chosen": -1.609297752380371, "logps/rejected": -1.7397960424423218, "loss": 1.0848, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.609297752380371, "rewards/margins": 0.13049837946891785, "rewards/rejected": -1.7397960424423218, "semantic_entropy": 0.6860819458961487, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 9.749373970268978, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.07602973282337189, "logits/rejected": 0.14425484836101532, "logps/chosen": -1.6898406744003296, "logps/rejected": -1.7698333263397217, "loss": 1.0882, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6898406744003296, "rewards/margins": 0.07999298721551895, "rewards/rejected": -1.7698333263397217, "semantic_entropy": 0.6580514311790466, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 9.515555465754796, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.05237163230776787, "logits/rejected": 0.025818347930908203, "logps/chosen": -1.6769046783447266, "logps/rejected": -1.8567724227905273, "loss": 1.0509, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6769046783447266, "rewards/margins": 0.17986764013767242, "rewards/rejected": -1.8567724227905273, "semantic_entropy": 0.6746851205825806, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 15.086499304257604, "learning_rate": 6.506238859180035e-07, "logits/chosen": 0.019944345578551292, "logits/rejected": 0.10378506034612656, "logps/chosen": -1.6250860691070557, "logps/rejected": -1.7133582830429077, "loss": 1.0988, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6250860691070557, "rewards/margins": 0.08827227354049683, "rewards/rejected": -1.7133582830429077, "semantic_entropy": 0.6908445358276367, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 13.830072743833517, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.03349882736802101, "logits/rejected": 0.05777007341384888, "logps/chosen": -1.6009029150009155, "logps/rejected": -1.7026888132095337, "loss": 1.1096, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.6009029150009155, "rewards/margins": 0.10178569704294205, "rewards/rejected": -1.7026888132095337, "semantic_entropy": 0.7001045346260071, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 11.699656608660698, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.07194206863641739, "logits/rejected": 0.07688136398792267, "logps/chosen": -1.6611896753311157, "logps/rejected": -1.9261270761489868, "loss": 1.0219, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6611896753311157, "rewards/margins": 0.2649373412132263, "rewards/rejected": -1.9261270761489868, "semantic_entropy": 0.6614036560058594, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 8.432293708205574, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.03905266523361206, "logits/rejected": 0.04177533835172653, "logps/chosen": -1.8525673151016235, "logps/rejected": -2.073270082473755, "loss": 1.0041, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.8525673151016235, "rewards/margins": 0.22070245444774628, "rewards/rejected": -2.073270082473755, "semantic_entropy": 0.6032805442810059, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 6.558363447038307, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.004520825110375881, "logits/rejected": 0.06874732673168182, "logps/chosen": -1.8738794326782227, "logps/rejected": -1.971431016921997, "loss": 1.0507, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.8738794326782227, "rewards/margins": 0.09755153954029083, "rewards/rejected": -1.971431016921997, "semantic_entropy": 0.5936424732208252, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 11.9814690281164, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.056425292044878006, "logits/rejected": 0.21399247646331787, "logps/chosen": -1.9227275848388672, "logps/rejected": -2.138357400894165, "loss": 1.014, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9227275848388672, "rewards/margins": 0.21562990546226501, "rewards/rejected": -2.138357400894165, "semantic_entropy": 0.5690654516220093, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 11.659848810770669, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.04911988228559494, "logits/rejected": 0.10953982919454575, "logps/chosen": -1.8240041732788086, "logps/rejected": -1.974021553993225, "loss": 1.0404, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.8240041732788086, "rewards/margins": 0.1500171720981598, "rewards/rejected": -1.974021553993225, "semantic_entropy": 0.608680009841919, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 17.654443437190544, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.07092205435037613, "logits/rejected": 0.15978960692882538, "logps/chosen": -1.84757399559021, "logps/rejected": -2.0060219764709473, "loss": 1.0119, "rewards/accuracies": 0.53125, "rewards/chosen": -1.84757399559021, "rewards/margins": 0.158447727560997, "rewards/rejected": -2.0060219764709473, "semantic_entropy": 0.5995103120803833, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.27376288175582886, "eval_logits/rejected": 0.35996997356414795, "eval_logps/chosen": -1.7929844856262207, "eval_logps/rejected": -2.0279812812805176, "eval_loss": 1.013211965560913, "eval_rewards/accuracies": 0.5660237669944763, "eval_rewards/chosen": -1.7929844856262207, "eval_rewards/margins": 0.23499667644500732, "eval_rewards/rejected": -2.0279812812805176, "eval_runtime": 35.6332, "eval_samples_per_second": 37.746, "eval_semantic_entropy": 0.6131907105445862, "eval_steps_per_second": 9.457, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 10.075168755240373, "learning_rate": 7.219251336898395e-07, "logits/chosen": -0.0024316341150552034, "logits/rejected": 0.0858984887599945, "logps/chosen": -1.8473918437957764, "logps/rejected": -2.068129062652588, "loss": 1.0621, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8473918437957764, "rewards/margins": 0.22073736786842346, "rewards/rejected": -2.068129062652588, "semantic_entropy": 0.6077993512153625, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 18.950538852606062, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.04571036621928215, "logits/rejected": 0.16462978720664978, "logps/chosen": -1.7759612798690796, "logps/rejected": -1.9969593286514282, "loss": 1.012, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.7759612798690796, "rewards/margins": 0.22099807858467102, "rewards/rejected": -1.9969593286514282, "semantic_entropy": 0.6101277470588684, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 12.639518148578636, "learning_rate": 7.397504456327985e-07, "logits/chosen": 0.04078054428100586, "logits/rejected": 0.08592768013477325, "logps/chosen": -1.8478095531463623, "logps/rejected": -2.025573492050171, "loss": 1.0192, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.8478095531463623, "rewards/margins": 0.17776378989219666, "rewards/rejected": -2.025573492050171, "semantic_entropy": 0.5999530553817749, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 10.190575297401363, "learning_rate": 7.486631016042781e-07, "logits/chosen": 0.0002573668898548931, "logits/rejected": 0.18812724947929382, "logps/chosen": -1.6988885402679443, "logps/rejected": -1.8715341091156006, "loss": 1.0463, "rewards/accuracies": 0.53125, "rewards/chosen": -1.6988885402679443, "rewards/margins": 0.17264559864997864, "rewards/rejected": -1.8715341091156006, "semantic_entropy": 0.6426397562026978, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 11.953799492318076, "learning_rate": 7.575757575757575e-07, "logits/chosen": 0.0010619193781167269, "logits/rejected": 0.19794592261314392, "logps/chosen": -1.7909456491470337, "logps/rejected": -2.094650983810425, "loss": 0.9602, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7909456491470337, "rewards/margins": 0.30370545387268066, "rewards/rejected": -2.094650983810425, "semantic_entropy": 0.6076642274856567, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 10.263106298016128, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.02113468013703823, "logits/rejected": 0.18052729964256287, "logps/chosen": -1.803195595741272, "logps/rejected": -2.2266640663146973, "loss": 0.9559, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.803195595741272, "rewards/margins": 0.4234686493873596, "rewards/rejected": -2.2266640663146973, "semantic_entropy": 0.6058921813964844, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 23.770582060225383, "learning_rate": 7.754010695187165e-07, "logits/chosen": 0.07859322428703308, "logits/rejected": 0.17100855708122253, "logps/chosen": -1.8036648035049438, "logps/rejected": -1.972700834274292, "loss": 1.0131, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8036648035049438, "rewards/margins": 0.1690361052751541, "rewards/rejected": -1.972700834274292, "semantic_entropy": 0.6105883717536926, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 8.942531726492792, "learning_rate": 7.84313725490196e-07, "logits/chosen": 0.027078593149781227, "logits/rejected": 0.11831989139318466, "logps/chosen": -1.8217464685440063, "logps/rejected": -2.0932116508483887, "loss": 0.9838, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8217464685440063, "rewards/margins": 0.2714650630950928, "rewards/rejected": -2.0932116508483887, "semantic_entropy": 0.5999422669410706, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 12.264776032319165, "learning_rate": 7.932263814616755e-07, "logits/chosen": 0.01442508865147829, "logits/rejected": 0.12444069236516953, "logps/chosen": -1.918039083480835, "logps/rejected": -2.2938976287841797, "loss": 0.9688, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.918039083480835, "rewards/margins": 0.37585827708244324, "rewards/rejected": -2.2938976287841797, "semantic_entropy": 0.5600379705429077, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 20.147606211927062, "learning_rate": 8.02139037433155e-07, "logits/chosen": 0.08424471318721771, "logits/rejected": 0.2129439115524292, "logps/chosen": -1.9648675918579102, "logps/rejected": -2.2125535011291504, "loss": 0.9685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9648675918579102, "rewards/margins": 0.24768579006195068, "rewards/rejected": -2.2125535011291504, "semantic_entropy": 0.555503249168396, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 10.39177189011024, "learning_rate": 8.110516934046346e-07, "logits/chosen": 0.1032770648598671, "logits/rejected": 0.18853013217449188, "logps/chosen": -1.8158471584320068, "logps/rejected": -2.1788058280944824, "loss": 0.9408, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.8158471584320068, "rewards/margins": 0.3629588484764099, "rewards/rejected": -2.1788058280944824, "semantic_entropy": 0.5897886157035828, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 9.700606050871391, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.019435208290815353, "logits/rejected": 0.10268989950418472, "logps/chosen": -1.9720776081085205, "logps/rejected": -2.17942476272583, "loss": 0.9789, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.9720776081085205, "rewards/margins": 0.2073473036289215, "rewards/rejected": -2.17942476272583, "semantic_entropy": 0.5492539405822754, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 13.542184802712866, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.25124862790107727, "logits/rejected": 0.2735592722892761, "logps/chosen": -2.041325807571411, "logps/rejected": -2.2813258171081543, "loss": 0.9751, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.041325807571411, "rewards/margins": 0.24000012874603271, "rewards/rejected": -2.2813258171081543, "semantic_entropy": 0.5254560112953186, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 9.43369236776717, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.28713709115982056, "logits/rejected": 0.24178346991539001, "logps/chosen": -2.011660575866699, "logps/rejected": -2.2471611499786377, "loss": 0.9972, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.011660575866699, "rewards/margins": 0.2355005294084549, "rewards/rejected": -2.2471611499786377, "semantic_entropy": 0.5238825678825378, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 8.440890172534203, "learning_rate": 8.467023172905525e-07, "logits/chosen": 0.06835935264825821, "logits/rejected": 0.2116996943950653, "logps/chosen": -1.9710376262664795, "logps/rejected": -2.4928886890411377, "loss": 0.8891, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.9710376262664795, "rewards/margins": 0.5218510031700134, "rewards/rejected": -2.4928886890411377, "semantic_entropy": 0.5133862495422363, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 13.271352574157751, "learning_rate": 8.55614973262032e-07, "logits/chosen": 0.11900024116039276, "logits/rejected": 0.3160308599472046, "logps/chosen": -2.046262502670288, "logps/rejected": -2.303492307662964, "loss": 0.9389, "rewards/accuracies": 0.59375, "rewards/chosen": -2.046262502670288, "rewards/margins": 0.25722989439964294, "rewards/rejected": -2.303492307662964, "semantic_entropy": 0.5193344354629517, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 17.40327979933669, "learning_rate": 8.645276292335115e-07, "logits/chosen": 0.11675455421209335, "logits/rejected": 0.16688722372055054, "logps/chosen": -2.2786717414855957, "logps/rejected": -2.450490951538086, "loss": 0.9805, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.2786717414855957, "rewards/margins": 0.17181938886642456, "rewards/rejected": -2.450490951538086, "semantic_entropy": 0.45640721917152405, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 10.5637127371766, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.14526286721229553, "logits/rejected": 0.21295936405658722, "logps/chosen": -2.2720694541931152, "logps/rejected": -2.437973737716675, "loss": 0.9629, "rewards/accuracies": 0.5625, "rewards/chosen": -2.2720694541931152, "rewards/margins": 0.16590480506420135, "rewards/rejected": -2.437973737716675, "semantic_entropy": 0.4487149715423584, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 13.968933074481987, "learning_rate": 8.823529411764705e-07, "logits/chosen": 0.10887887328863144, "logits/rejected": 0.1369287371635437, "logps/chosen": -2.321854829788208, "logps/rejected": -2.471646785736084, "loss": 0.9461, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.321854829788208, "rewards/margins": 0.14979204535484314, "rewards/rejected": -2.471646785736084, "semantic_entropy": 0.42514246702194214, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 15.91869715362306, "learning_rate": 8.912655971479501e-07, "logits/chosen": 0.11035114526748657, "logits/rejected": 0.19290763139724731, "logps/chosen": -2.324972152709961, "logps/rejected": -2.60076642036438, "loss": 0.8935, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.324972152709961, "rewards/margins": 0.2757939100265503, "rewards/rejected": -2.60076642036438, "semantic_entropy": 0.4203091263771057, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 17.82851785747078, "learning_rate": 9.001782531194295e-07, "logits/chosen": 0.10254337638616562, "logits/rejected": 0.23869287967681885, "logps/chosen": -2.434830904006958, "logps/rejected": -2.6683614253997803, "loss": 0.88, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.434830904006958, "rewards/margins": 0.23353052139282227, "rewards/rejected": -2.6683614253997803, "semantic_entropy": 0.38676854968070984, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 9.222919526301315, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.2949961721897125, "logits/rejected": 0.34191757440567017, "logps/chosen": -2.5707197189331055, "logps/rejected": -2.9306082725524902, "loss": 0.8353, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.5707197189331055, "rewards/margins": 0.35988861322402954, "rewards/rejected": -2.9306082725524902, "semantic_entropy": 0.3533479571342468, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 13.125321055251398, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.26439735293388367, "logits/rejected": 0.3643534779548645, "logps/chosen": -2.6091151237487793, "logps/rejected": -2.9702847003936768, "loss": 0.8275, "rewards/accuracies": 0.625, "rewards/chosen": -2.6091151237487793, "rewards/margins": 0.3611697256565094, "rewards/rejected": -2.9702847003936768, "semantic_entropy": 0.3468519449234009, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 13.837098088359745, "learning_rate": 9.26916221033868e-07, "logits/chosen": 0.1988576203584671, "logits/rejected": 0.3169275224208832, "logps/chosen": -2.997946262359619, "logps/rejected": -3.395313262939453, "loss": 0.824, "rewards/accuracies": 0.625, "rewards/chosen": -2.997946262359619, "rewards/margins": 0.39736661314964294, "rewards/rejected": -3.395313262939453, "semantic_entropy": 0.2663891315460205, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 23.212683763316665, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.362657368183136, "logits/rejected": 0.431486040353775, "logps/chosen": -3.46795654296875, "logps/rejected": -3.9383537769317627, "loss": 0.7851, "rewards/accuracies": 0.65625, "rewards/chosen": -3.46795654296875, "rewards/margins": 0.4703969359397888, "rewards/rejected": -3.9383537769317627, "semantic_entropy": 0.19957469403743744, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 19.427149448565736, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.3255121111869812, "logits/rejected": 0.39444050192832947, "logps/chosen": -3.581265926361084, "logps/rejected": -4.079986095428467, "loss": 0.7894, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.581265926361084, "rewards/margins": 0.4987207055091858, "rewards/rejected": -4.079986095428467, "semantic_entropy": 0.19464334845542908, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 17.35259546926179, "learning_rate": 9.536541889483066e-07, "logits/chosen": 0.20121872425079346, "logits/rejected": 0.4174925684928894, "logps/chosen": -3.966810941696167, "logps/rejected": -4.486771583557129, "loss": 0.721, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.966810941696167, "rewards/margins": 0.5199612379074097, "rewards/rejected": -4.486771583557129, "semantic_entropy": 0.14531609416007996, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 26.596890402490573, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.2888971269130707, "logits/rejected": 0.3459900915622711, "logps/chosen": -4.495975971221924, "logps/rejected": -4.9110822677612305, "loss": 0.7492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.495975971221924, "rewards/margins": 0.4151054322719574, "rewards/rejected": -4.9110822677612305, "semantic_entropy": 0.09809108078479767, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 18.02680952570327, "learning_rate": 9.714795008912655e-07, "logits/chosen": 0.20895743370056152, "logits/rejected": 0.3623776435852051, "logps/chosen": -4.741501808166504, "logps/rejected": -5.503144264221191, "loss": 0.6348, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.741501808166504, "rewards/margins": 0.7616419196128845, "rewards/rejected": -5.503144264221191, "semantic_entropy": 0.08251913636922836, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 20.727150313169535, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.30375415086746216, "logits/rejected": 0.3442818522453308, "logps/chosen": -5.456840991973877, "logps/rejected": -6.024032115936279, "loss": 0.7065, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.456840991973877, "rewards/margins": 0.5671912431716919, "rewards/rejected": -6.024032115936279, "semantic_entropy": 0.06045646220445633, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 27.134124904666713, "learning_rate": 9.893048128342244e-07, "logits/chosen": 0.28694844245910645, "logits/rejected": 0.3676696717739105, "logps/chosen": -5.522095680236816, "logps/rejected": -5.709345817565918, "loss": 0.8281, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -5.522095680236816, "rewards/margins": 0.18725113570690155, "rewards/rejected": -5.709345817565918, "semantic_entropy": 0.05345703288912773, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 31.40474528083335, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.3329199552536011, "logits/rejected": 0.3428536355495453, "logps/chosen": -4.907380104064941, "logps/rejected": -5.42898416519165, "loss": 0.684, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.907380104064941, "rewards/margins": 0.5216037034988403, "rewards/rejected": -5.42898416519165, "semantic_entropy": 0.08699695765972137, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 15.731040510232038, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.32874903082847595, "logits/rejected": 0.37473997473716736, "logps/chosen": -4.960855960845947, "logps/rejected": -5.458104610443115, "loss": 0.6559, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.960855960845947, "rewards/margins": 0.49724894762039185, "rewards/rejected": -5.458104610443115, "semantic_entropy": 0.07233523577451706, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 26.591918700910675, "learning_rate": 9.999921413906797e-07, "logits/chosen": 0.2645338177680969, "logits/rejected": 0.4236833453178406, "logps/chosen": -4.993116855621338, "logps/rejected": -5.463040828704834, "loss": 0.6672, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.993116855621338, "rewards/margins": 0.4699248671531677, "rewards/rejected": -5.463040828704834, "semantic_entropy": 0.06868621706962585, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 17.123203800580065, "learning_rate": 9.999809841765644e-07, "logits/chosen": 0.26640480756759644, "logits/rejected": 0.3024447560310364, "logps/chosen": -4.769078731536865, "logps/rejected": -5.275615692138672, "loss": 0.6851, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.769078731536865, "rewards/margins": 0.5065367817878723, "rewards/rejected": -5.275615692138672, "semantic_entropy": 0.07678450644016266, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 16.862219340555285, "learning_rate": 9.999649761447477e-07, "logits/chosen": 0.2403547316789627, "logits/rejected": 0.35913315415382385, "logps/chosen": -4.844521999359131, "logps/rejected": -5.310414791107178, "loss": 0.6817, "rewards/accuracies": 0.65625, "rewards/chosen": -4.844521999359131, "rewards/margins": 0.4658929407596588, "rewards/rejected": -5.310414791107178, "semantic_entropy": 0.07107989490032196, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 20.48877444397547, "learning_rate": 9.999441174505398e-07, "logits/chosen": 0.2463439702987671, "logits/rejected": 0.3041590750217438, "logps/chosen": -5.344332218170166, "logps/rejected": -5.5950727462768555, "loss": 0.7775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -5.344332218170166, "rewards/margins": 0.250741183757782, "rewards/rejected": -5.5950727462768555, "semantic_entropy": 0.05236155912280083, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 33.45307601229143, "learning_rate": 9.999184082963116e-07, "logits/chosen": 0.2851886749267578, "logits/rejected": 0.37152618169784546, "logps/chosen": -5.112926006317139, "logps/rejected": -5.441379547119141, "loss": 0.7305, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -5.112926006317139, "rewards/margins": 0.3284529149532318, "rewards/rejected": -5.441379547119141, "semantic_entropy": 0.05966230109333992, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 28.772414597800683, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.3128214478492737, "logits/rejected": 0.4057738184928894, "logps/chosen": -5.047120094299316, "logps/rejected": -5.475900173187256, "loss": 0.6587, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.047120094299316, "rewards/margins": 0.4287797510623932, "rewards/rejected": -5.475900173187256, "semantic_entropy": 0.056463856250047684, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 21.595817825496415, "learning_rate": 9.99852439652573e-07, "logits/chosen": 0.27063342928886414, "logits/rejected": 0.3820621371269226, "logps/chosen": -5.453131198883057, "logps/rejected": -5.779126167297363, "loss": 0.6993, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -5.453131198883057, "rewards/margins": 0.32599514722824097, "rewards/rejected": -5.779126167297363, "semantic_entropy": 0.04167807847261429, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 26.050724529880945, "learning_rate": 9.998121808030904e-07, "logits/chosen": 0.24278345704078674, "logits/rejected": 0.3093962073326111, "logps/chosen": -5.513918876647949, "logps/rejected": -5.7660112380981445, "loss": 0.8, "rewards/accuracies": 0.59375, "rewards/chosen": -5.513918876647949, "rewards/margins": 0.25209134817123413, "rewards/rejected": -5.7660112380981445, "semantic_entropy": 0.04349964112043381, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 45.051043777132236, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.31496500968933105, "logits/rejected": 0.47544389963150024, "logps/chosen": -5.2426862716674805, "logps/rejected": -5.6988115310668945, "loss": 0.6803, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -5.2426862716674805, "rewards/margins": 0.4561251699924469, "rewards/rejected": -5.6988115310668945, "semantic_entropy": 0.05658901482820511, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 23.856980522226188, "learning_rate": 9.99717116001853e-07, "logits/chosen": 0.32243964076042175, "logits/rejected": 0.39280059933662415, "logps/chosen": -5.77672815322876, "logps/rejected": -6.442985534667969, "loss": 0.5973, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.77672815322876, "rewards/margins": 0.6662576794624329, "rewards/rejected": -6.442985534667969, "semantic_entropy": 0.03704090788960457, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 15.346139192682116, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.4094429016113281, "logits/rejected": 0.44762665033340454, "logps/chosen": -6.212830543518066, "logps/rejected": -6.7759881019592285, "loss": 0.6226, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -6.212830543518066, "rewards/margins": 0.5631579160690308, "rewards/rejected": -6.7759881019592285, "semantic_entropy": 0.03154679387807846, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 18.787286491671612, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.41264209151268005, "logits/rejected": 0.5199421048164368, "logps/chosen": -6.266766548156738, "logps/rejected": -6.7597222328186035, "loss": 0.6553, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -6.266766548156738, "rewards/margins": 0.49295586347579956, "rewards/rejected": -6.7597222328186035, "semantic_entropy": 0.03314858675003052, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 23.697424686130283, "learning_rate": 9.995381583144996e-07, "logits/chosen": 0.3286336362361908, "logits/rejected": 0.416952908039093, "logps/chosen": -6.489420413970947, "logps/rejected": -7.091695308685303, "loss": 0.6025, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -6.489420413970947, "rewards/margins": 0.6022747755050659, "rewards/rejected": -7.091695308685303, "semantic_entropy": 0.019754167646169662, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 23.346401547821337, "learning_rate": 9.994688118905471e-07, "logits/chosen": 0.39607900381088257, "logits/rejected": 0.5640990138053894, "logps/chosen": -6.5967607498168945, "logps/rejected": -7.059272766113281, "loss": 0.674, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -6.5967607498168945, "rewards/margins": 0.4625115990638733, "rewards/rejected": -7.059272766113281, "semantic_entropy": 0.023461516946554184, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 28.41310343878819, "learning_rate": 9.993946196179912e-07, "logits/chosen": 0.37280526757240295, "logits/rejected": 0.5077700018882751, "logps/chosen": -6.618893623352051, "logps/rejected": -6.993128776550293, "loss": 0.6847, "rewards/accuracies": 0.625, "rewards/chosen": -6.618893623352051, "rewards/margins": 0.3742350935935974, "rewards/rejected": -6.993128776550293, "semantic_entropy": 0.016319947317242622, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 17.075034812283786, "learning_rate": 9.993155822166455e-07, "logits/chosen": 0.4683307707309723, "logits/rejected": 0.5001148581504822, "logps/chosen": -6.166420936584473, "logps/rejected": -6.613181114196777, "loss": 0.6588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.166420936584473, "rewards/margins": 0.4467601776123047, "rewards/rejected": -6.613181114196777, "semantic_entropy": 0.023232873529195786, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 22.115212411816152, "learning_rate": 9.992317004533313e-07, "logits/chosen": 0.49777716398239136, "logits/rejected": 0.5633991360664368, "logps/chosen": -6.14475154876709, "logps/rejected": -6.610726833343506, "loss": 0.6416, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.14475154876709, "rewards/margins": 0.4659750461578369, "rewards/rejected": -6.610726833343506, "semantic_entropy": 0.023257287219166756, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 20.090310906010423, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.4952174127101898, "logits/rejected": 0.5142993927001953, "logps/chosen": -5.922252655029297, "logps/rejected": -6.414206504821777, "loss": 0.6695, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -5.922252655029297, "rewards/margins": 0.49195390939712524, "rewards/rejected": -6.414206504821777, "semantic_entropy": 0.027981286868453026, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 19.79614358859752, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.5208943486213684, "logits/rejected": 0.5800861120223999, "logps/chosen": -6.255806922912598, "logps/rejected": -6.602316856384277, "loss": 0.697, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -6.255806922912598, "rewards/margins": 0.34651073813438416, "rewards/rejected": -6.602316856384277, "semantic_entropy": 0.02483288012444973, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 19.579221986465477, "learning_rate": 9.989509973647416e-07, "logits/chosen": 0.5133857727050781, "logits/rejected": 0.58699631690979, "logps/chosen": -6.382502555847168, "logps/rejected": -6.7586350440979, "loss": 0.6752, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -6.382502555847168, "rewards/margins": 0.3761317729949951, "rewards/rejected": -6.7586350440979, "semantic_entropy": 0.019113317131996155, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 19.24827550379443, "learning_rate": 9.988477467616445e-07, "logits/chosen": 0.5319421291351318, "logits/rejected": 0.6193274259567261, "logps/chosen": -6.479439735412598, "logps/rejected": -6.937412261962891, "loss": 0.5944, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.479439735412598, "rewards/margins": 0.4579733908176422, "rewards/rejected": -6.937412261962891, "semantic_entropy": 0.016540968790650368, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 20.36938516339038, "learning_rate": 9.987396563355205e-07, "logits/chosen": 0.5054916143417358, "logits/rejected": 0.5468543171882629, "logps/chosen": -6.396731376647949, "logps/rejected": -6.9033098220825195, "loss": 0.5913, "rewards/accuracies": 0.71875, "rewards/chosen": -6.396731376647949, "rewards/margins": 0.5065786242485046, "rewards/rejected": -6.9033098220825195, "semantic_entropy": 0.019933702424168587, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 25.694663813660224, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.4442412257194519, "logits/rejected": 0.5351762771606445, "logps/chosen": -6.232950687408447, "logps/rejected": -6.665299892425537, "loss": 0.6833, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.232950687408447, "rewards/margins": 0.4323497414588928, "rewards/rejected": -6.665299892425537, "semantic_entropy": 0.02159653976559639, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 26.644324005748285, "learning_rate": 9.985089602559123e-07, "logits/chosen": 0.49901971220970154, "logits/rejected": 0.5869329571723938, "logps/chosen": -6.33712911605835, "logps/rejected": -6.925673007965088, "loss": 0.5808, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.33712911605835, "rewards/margins": 0.588544487953186, "rewards/rejected": -6.925673007965088, "semantic_entropy": 0.01862289011478424, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 20.404114370621034, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.5496017336845398, "logits/rejected": 0.5704872012138367, "logps/chosen": -6.48660135269165, "logps/rejected": -6.933077812194824, "loss": 0.6603, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -6.48660135269165, "rewards/margins": 0.44647669792175293, "rewards/rejected": -6.933077812194824, "semantic_entropy": 0.017888184636831284, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 20.958217633474742, "learning_rate": 9.982589180787532e-07, "logits/chosen": 0.4869377613067627, "logits/rejected": 0.5502743721008301, "logps/chosen": -6.533112525939941, "logps/rejected": -6.993691921234131, "loss": 0.6216, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.533112525939941, "rewards/margins": 0.4605790674686432, "rewards/rejected": -6.993691921234131, "semantic_entropy": 0.01869816519320011, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 28.100366278994386, "learning_rate": 9.981266452066553e-07, "logits/chosen": 0.38128662109375, "logits/rejected": 0.45251068472862244, "logps/chosen": -6.801139831542969, "logps/rejected": -7.043248176574707, "loss": 0.6849, "rewards/accuracies": 0.59375, "rewards/chosen": -6.801139831542969, "rewards/margins": 0.2421083003282547, "rewards/rejected": -7.043248176574707, "semantic_entropy": 0.012132355943322182, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 20.82013187037515, "learning_rate": 9.979895395076608e-07, "logits/chosen": 0.30956459045410156, "logits/rejected": 0.44323819875717163, "logps/chosen": -6.558831691741943, "logps/rejected": -7.164409637451172, "loss": 0.575, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.558831691741943, "rewards/margins": 0.6055777668952942, "rewards/rejected": -7.164409637451172, "semantic_entropy": 0.01754785142838955, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 21.30045872363481, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.3447558283805847, "logits/rejected": 0.4274185299873352, "logps/chosen": -6.733677864074707, "logps/rejected": -7.2757744789123535, "loss": 0.6037, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -6.733677864074707, "rewards/margins": 0.5420972108840942, "rewards/rejected": -7.2757744789123535, "semantic_entropy": 0.015751570463180542, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 30.288720792247716, "learning_rate": 9.97700834996658e-07, "logits/chosen": 0.3281204402446747, "logits/rejected": 0.449833482503891, "logps/chosen": -7.066946983337402, "logps/rejected": -7.522922515869141, "loss": 0.6432, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -7.066946983337402, "rewards/margins": 0.4559754431247711, "rewards/rejected": -7.522922515869141, "semantic_entropy": 0.010822773911058903, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 21.97533323049887, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.4738125205039978, "logits/rejected": 0.6201906204223633, "logps/chosen": -6.687346458435059, "logps/rejected": -7.375749111175537, "loss": 0.5906, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.687346458435059, "rewards/margins": 0.6884029507637024, "rewards/rejected": -7.375749111175537, "semantic_entropy": 0.01416093111038208, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 16.154939819122443, "learning_rate": 9.973928157497674e-07, "logits/chosen": 0.397294819355011, "logits/rejected": 0.4953377842903137, "logps/chosen": -6.555293083190918, "logps/rejected": -7.133930206298828, "loss": 0.5751, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.555293083190918, "rewards/margins": 0.5786372423171997, "rewards/rejected": -7.133930206298828, "semantic_entropy": 0.016166144981980324, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 18.66427550708316, "learning_rate": 9.972315668065927e-07, "logits/chosen": 0.39967241883277893, "logits/rejected": 0.4871141314506531, "logps/chosen": -6.588616371154785, "logps/rejected": -7.018074989318848, "loss": 0.6501, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -6.588616371154785, "rewards/margins": 0.4294595718383789, "rewards/rejected": -7.018074989318848, "semantic_entropy": 0.015043877065181732, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 19.58743135959893, "learning_rate": 9.97065493720576e-07, "logits/chosen": 0.424797385931015, "logits/rejected": 0.5106293559074402, "logps/chosen": -6.42412805557251, "logps/rejected": -6.823407173156738, "loss": 0.661, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -6.42412805557251, "rewards/margins": 0.3992784023284912, "rewards/rejected": -6.823407173156738, "semantic_entropy": 0.017190445214509964, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 19.710530643393298, "learning_rate": 9.968945981029594e-07, "logits/chosen": 0.5481308102607727, "logits/rejected": 0.6441117525100708, "logps/chosen": -6.604589939117432, "logps/rejected": -7.156881809234619, "loss": 0.5894, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.604589939117432, "rewards/margins": 0.5522912740707397, "rewards/rejected": -7.156881809234619, "semantic_entropy": 0.014214654453098774, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 17.871273849433326, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.6293722987174988, "logits/rejected": 0.6717933416366577, "logps/chosen": -6.835976600646973, "logps/rejected": -7.2494215965271, "loss": 0.6631, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -6.835976600646973, "rewards/margins": 0.4134441316127777, "rewards/rejected": -7.2494215965271, "semantic_entropy": 0.01153610274195671, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 16.865148721700457, "learning_rate": 9.965383459518179e-07, "logits/chosen": 0.541202962398529, "logits/rejected": 0.6529287695884705, "logps/chosen": -6.675736904144287, "logps/rejected": -7.145176887512207, "loss": 0.6284, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.675736904144287, "rewards/margins": 0.4694399833679199, "rewards/rejected": -7.145176887512207, "semantic_entropy": 0.013248731382191181, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 23.498295974070185, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.5743650197982788, "logits/rejected": 0.663760781288147, "logps/chosen": -6.699901580810547, "logps/rejected": -7.10396671295166, "loss": 0.6749, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.699901580810547, "rewards/margins": 0.4040653109550476, "rewards/rejected": -7.10396671295166, "semantic_entropy": 0.014644038863480091, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 15.10343166970609, "learning_rate": 9.961628241785746e-07, "logits/chosen": 0.4620290696620941, "logits/rejected": 0.5227762460708618, "logps/chosen": -6.713381767272949, "logps/rejected": -7.118601322174072, "loss": 0.6779, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -6.713381767272949, "rewards/margins": 0.4052188992500305, "rewards/rejected": -7.118601322174072, "semantic_entropy": 0.014408141374588013, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 16.82158650124028, "learning_rate": 9.959678417085998e-07, "logits/chosen": 0.4263577461242676, "logits/rejected": 0.4895492494106293, "logps/chosen": -6.6601409912109375, "logps/rejected": -7.117767333984375, "loss": 0.6173, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -6.6601409912109375, "rewards/margins": 0.45762643218040466, "rewards/rejected": -7.117767333984375, "semantic_entropy": 0.014361525885760784, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 13.610647877557357, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.5320577621459961, "logits/rejected": 0.6060940027236938, "logps/chosen": -6.88693904876709, "logps/rejected": -7.462642669677734, "loss": 0.5833, "rewards/accuracies": 0.6875, "rewards/chosen": -6.88693904876709, "rewards/margins": 0.5757043957710266, "rewards/rejected": -7.462642669677734, "semantic_entropy": 0.011815531179308891, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 10.55729867477025, "learning_rate": 9.95563443060529e-07, "logits/chosen": 0.4426754415035248, "logits/rejected": 0.527428150177002, "logps/chosen": -6.9573655128479, "logps/rejected": -7.31267786026001, "loss": 0.6804, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.9573655128479, "rewards/margins": 0.35531362891197205, "rewards/rejected": -7.31267786026001, "semantic_entropy": 0.011011673137545586, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 20.225405830042792, "learning_rate": 9.95354030805911e-07, "logits/chosen": 0.38837724924087524, "logits/rejected": 0.4707297384738922, "logps/chosen": -6.923590660095215, "logps/rejected": -7.26629638671875, "loss": 0.6241, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -6.923590660095215, "rewards/margins": 0.34270578622817993, "rewards/rejected": -7.26629638671875, "semantic_entropy": 0.010305705480277538, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 24.361936444306224, "learning_rate": 9.951398126243133e-07, "logits/chosen": 0.49288463592529297, "logits/rejected": 0.5485498309135437, "logps/chosen": -6.930272102355957, "logps/rejected": -7.437797546386719, "loss": 0.6253, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -6.930272102355957, "rewards/margins": 0.5075257420539856, "rewards/rejected": -7.437797546386719, "semantic_entropy": 0.0111698554828763, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 17.05301230734048, "learning_rate": 9.94920790594082e-07, "logits/chosen": 0.3991442918777466, "logits/rejected": 0.45411986112594604, "logps/chosen": -6.633962154388428, "logps/rejected": -7.100059509277344, "loss": 0.6083, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.633962154388428, "rewards/margins": 0.4660969376564026, "rewards/rejected": -7.100059509277344, "semantic_entropy": 0.014079605229198933, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 16.409030978097405, "learning_rate": 9.946969668401696e-07, "logits/chosen": 0.2830341160297394, "logits/rejected": 0.3877353072166443, "logps/chosen": -6.539282321929932, "logps/rejected": -7.096798896789551, "loss": 0.6047, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.539282321929932, "rewards/margins": 0.5575160384178162, "rewards/rejected": -7.096798896789551, "semantic_entropy": 0.015337374992668629, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 13.519739236635578, "learning_rate": 9.944683435341155e-07, "logits/chosen": 0.30631715059280396, "logits/rejected": 0.35199958086013794, "logps/chosen": -6.5635480880737305, "logps/rejected": -7.057257175445557, "loss": 0.5924, "rewards/accuracies": 0.6875, "rewards/chosen": -6.5635480880737305, "rewards/margins": 0.49370861053466797, "rewards/rejected": -7.057257175445557, "semantic_entropy": 0.015060871839523315, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.4883604347705841, "eval_logits/rejected": 0.5385698676109314, "eval_logps/chosen": -6.740940093994141, "eval_logps/rejected": -7.298737049102783, "eval_loss": 0.5850783586502075, "eval_rewards/accuracies": 0.6810088753700256, "eval_rewards/chosen": -6.740940093994141, "eval_rewards/margins": 0.5577963590621948, "eval_rewards/rejected": -7.298737049102783, "eval_runtime": 34.813, "eval_samples_per_second": 38.635, "eval_semantic_entropy": 0.013112816959619522, "eval_steps_per_second": 9.68, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 19.237636205467812, "learning_rate": 9.942349228940236e-07, "logits/chosen": 0.30846095085144043, "logits/rejected": 0.3973791003227234, "logps/chosen": -6.796361446380615, "logps/rejected": -7.3480072021484375, "loss": 0.5769, "rewards/accuracies": 0.71875, "rewards/chosen": -6.796361446380615, "rewards/margins": 0.551645040512085, "rewards/rejected": -7.3480072021484375, "semantic_entropy": 0.01173459179699421, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 17.157117885795127, "learning_rate": 9.939967071845424e-07, "logits/chosen": 0.30237749218940735, "logits/rejected": 0.3430071473121643, "logps/chosen": -6.8641839027404785, "logps/rejected": -7.332343101501465, "loss": 0.6095, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -6.8641839027404785, "rewards/margins": 0.4681592583656311, "rewards/rejected": -7.332343101501465, "semantic_entropy": 0.013705052435398102, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 17.86668130034969, "learning_rate": 9.937536987168413e-07, "logits/chosen": 0.31386134028434753, "logits/rejected": 0.3876408636569977, "logps/chosen": -6.719006538391113, "logps/rejected": -7.485579490661621, "loss": 0.5959, "rewards/accuracies": 0.71875, "rewards/chosen": -6.719006538391113, "rewards/margins": 0.7665729522705078, "rewards/rejected": -7.485579490661621, "semantic_entropy": 0.016674160957336426, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 14.614768670805427, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.39349859952926636, "logits/rejected": 0.4051267206668854, "logps/chosen": -7.036497592926025, "logps/rejected": -7.662347316741943, "loss": 0.5724, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -7.036497592926025, "rewards/margins": 0.6258499622344971, "rewards/rejected": -7.662347316741943, "semantic_entropy": 0.012881157919764519, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 30.403962010078292, "learning_rate": 9.932533129839333e-07, "logits/chosen": 0.3851960599422455, "logits/rejected": 0.45596402883529663, "logps/chosen": -7.236742973327637, "logps/rejected": -7.798255920410156, "loss": 0.6115, "rewards/accuracies": 0.71875, "rewards/chosen": -7.236742973327637, "rewards/margins": 0.5615121126174927, "rewards/rejected": -7.798255920410156, "semantic_entropy": 0.013946113176643848, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 20.869180068401533, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.45160192251205444, "logits/rejected": 0.5339521765708923, "logps/chosen": -7.302011966705322, "logps/rejected": -7.983066558837891, "loss": 0.5467, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -7.302011966705322, "rewards/margins": 0.6810555458068848, "rewards/rejected": -7.983066558837891, "semantic_entropy": 0.010290712118148804, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 19.2525609808125, "learning_rate": 9.927337851142314e-07, "logits/chosen": 0.5376949906349182, "logits/rejected": 0.588058590888977, "logps/chosen": -7.635756492614746, "logps/rejected": -8.161395072937012, "loss": 0.6001, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -7.635756492614746, "rewards/margins": 0.5256373286247253, "rewards/rejected": -8.161395072937012, "semantic_entropy": 0.008404644206166267, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 19.454601896686906, "learning_rate": 9.924668491496474e-07, "logits/chosen": 0.5822176933288574, "logits/rejected": 0.6964151263237, "logps/chosen": -7.75359582901001, "logps/rejected": -8.28437614440918, "loss": 0.6008, "rewards/accuracies": 0.6875, "rewards/chosen": -7.75359582901001, "rewards/margins": 0.5307798981666565, "rewards/rejected": -8.28437614440918, "semantic_entropy": 0.006352287717163563, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 16.509437642167754, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.6624695658683777, "logits/rejected": 0.6998416185379028, "logps/chosen": -7.631247043609619, "logps/rejected": -8.026775360107422, "loss": 0.6553, "rewards/accuracies": 0.65625, "rewards/chosen": -7.631247043609619, "rewards/margins": 0.39552828669548035, "rewards/rejected": -8.026775360107422, "semantic_entropy": 0.00823633000254631, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 18.796921800518653, "learning_rate": 9.919186461100574e-07, "logits/chosen": 0.6312128305435181, "logits/rejected": 0.6908556222915649, "logps/chosen": -7.528157711029053, "logps/rejected": -8.073545455932617, "loss": 0.565, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -7.528157711029053, "rewards/margins": 0.545387864112854, "rewards/rejected": -8.073545455932617, "semantic_entropy": 0.007035645190626383, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 27.677336964086486, "learning_rate": 9.9163738435372e-07, "logits/chosen": 0.5352962017059326, "logits/rejected": 0.6097812056541443, "logps/chosen": -7.2810773849487305, "logps/rejected": -7.931620121002197, "loss": 0.6214, "rewards/accuracies": 0.71875, "rewards/chosen": -7.2810773849487305, "rewards/margins": 0.6505423188209534, "rewards/rejected": -7.931620121002197, "semantic_entropy": 0.009111289866268635, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 14.272226362354068, "learning_rate": 9.913513527293234e-07, "logits/chosen": 0.34996968507766724, "logits/rejected": 0.4666077494621277, "logps/chosen": -7.2218828201293945, "logps/rejected": -7.859616756439209, "loss": 0.6001, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -7.2218828201293945, "rewards/margins": 0.6377342939376831, "rewards/rejected": -7.859616756439209, "semantic_entropy": 0.009801121428608894, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 26.063412987584922, "learning_rate": 9.910605540119474e-07, "logits/chosen": 0.33111342787742615, "logits/rejected": 0.40587443113327026, "logps/chosen": -7.022481441497803, "logps/rejected": -7.678803443908691, "loss": 0.605, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -7.022481441497803, "rewards/margins": 0.6563228368759155, "rewards/rejected": -7.678803443908691, "semantic_entropy": 0.013630586676299572, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 13.43476713833555, "learning_rate": 9.907649910229227e-07, "logits/chosen": 0.22445161640644073, "logits/rejected": 0.39778199791908264, "logps/chosen": -6.874536037445068, "logps/rejected": -7.5805511474609375, "loss": 0.5545, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.874536037445068, "rewards/margins": 0.7060148119926453, "rewards/rejected": -7.5805511474609375, "semantic_entropy": 0.014564545825123787, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 20.657905051442476, "learning_rate": 9.90464666629803e-07, "logits/chosen": 0.3672102391719818, "logits/rejected": 0.41505926847457886, "logps/chosen": -7.1059699058532715, "logps/rejected": -7.5586981773376465, "loss": 0.6699, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -7.1059699058532715, "rewards/margins": 0.4527283310890198, "rewards/rejected": -7.5586981773376465, "semantic_entropy": 0.01175951398909092, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 10.963959193632688, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.3955201506614685, "logits/rejected": 0.5143112540245056, "logps/chosen": -7.272347927093506, "logps/rejected": -7.984310150146484, "loss": 0.5331, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -7.272347927093506, "rewards/margins": 0.7119626998901367, "rewards/rejected": -7.984310150146484, "semantic_entropy": 0.009277241304516792, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 17.54527652296308, "learning_rate": 9.898497453324384e-07, "logits/chosen": 0.330959290266037, "logits/rejected": 0.38617414236068726, "logps/chosen": -7.402396202087402, "logps/rejected": -7.947201728820801, "loss": 0.5931, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -7.402396202087402, "rewards/margins": 0.5448045134544373, "rewards/rejected": -7.947201728820801, "semantic_entropy": 0.008298173546791077, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 18.303499456390117, "learning_rate": 9.895351543941628e-07, "logits/chosen": 0.2591246962547302, "logits/rejected": 0.33385053277015686, "logps/chosen": -7.262864589691162, "logps/rejected": -7.755476951599121, "loss": 0.6183, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -7.262864589691162, "rewards/margins": 0.49261218309402466, "rewards/rejected": -7.755476951599121, "semantic_entropy": 0.012104134075343609, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 17.671561073979735, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.348507821559906, "logits/rejected": 0.4041469991207123, "logps/chosen": -7.320245265960693, "logps/rejected": -7.844791412353516, "loss": 0.6116, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -7.320245265960693, "rewards/margins": 0.5245463848114014, "rewards/rejected": -7.844791412353516, "semantic_entropy": 0.012385739013552666, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 19.73543179549833, "learning_rate": 9.88891727199209e-07, "logits/chosen": 0.2475900948047638, "logits/rejected": 0.30757012963294983, "logps/chosen": -7.321754455566406, "logps/rejected": -7.892062187194824, "loss": 0.6304, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -7.321754455566406, "rewards/margins": 0.5703079104423523, "rewards/rejected": -7.892062187194824, "semantic_entropy": 0.011291766539216042, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 23.01329046657235, "learning_rate": 9.885628971850641e-07, "logits/chosen": 0.3327587842941284, "logits/rejected": 0.4450058043003082, "logps/chosen": -7.170090675354004, "logps/rejected": -7.9321088790893555, "loss": 0.5644, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.170090675354004, "rewards/margins": 0.7620194554328918, "rewards/rejected": -7.9321088790893555, "semantic_entropy": 0.013015474192798138, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 13.383948275630102, "learning_rate": 9.882293271315481e-07, "logits/chosen": 0.3371312916278839, "logits/rejected": 0.39455828070640564, "logps/chosen": -7.155638217926025, "logps/rejected": -7.695284843444824, "loss": 0.6107, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -7.155638217926025, "rewards/margins": 0.5396467447280884, "rewards/rejected": -7.695284843444824, "semantic_entropy": 0.010182186029851437, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 17.28977001865095, "learning_rate": 9.878910202749589e-07, "logits/chosen": 0.3446193337440491, "logits/rejected": 0.4533708095550537, "logps/chosen": -7.136691093444824, "logps/rejected": -7.7163190841674805, "loss": 0.6031, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -7.136691093444824, "rewards/margins": 0.5796278715133667, "rewards/rejected": -7.7163190841674805, "semantic_entropy": 0.01082690805196762, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 16.51391506460225, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.3450031876564026, "logits/rejected": 0.4413267970085144, "logps/chosen": -6.917219638824463, "logps/rejected": -7.611997127532959, "loss": 0.5868, "rewards/accuracies": 0.71875, "rewards/chosen": -6.917219638824463, "rewards/margins": 0.6947778463363647, "rewards/rejected": -7.611997127532959, "semantic_entropy": 0.013024079613387585, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 22.468621981222572, "learning_rate": 9.87200209327504e-07, "logits/chosen": 0.30431827902793884, "logits/rejected": 0.4026539921760559, "logps/chosen": -7.283698081970215, "logps/rejected": -7.741362571716309, "loss": 0.6606, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.283698081970215, "rewards/margins": 0.4576646387577057, "rewards/rejected": -7.741362571716309, "semantic_entropy": 0.009632373228669167, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 22.170244658612646, "learning_rate": 9.868477119388894e-07, "logits/chosen": 0.29118505120277405, "logits/rejected": 0.34077757596969604, "logps/chosen": -7.040729522705078, "logps/rejected": -7.742720127105713, "loss": 0.5839, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -7.040729522705078, "rewards/margins": 0.7019898295402527, "rewards/rejected": -7.742720127105713, "semantic_entropy": 0.01227110717445612, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 17.301303488197902, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.285023957490921, "logits/rejected": 0.3279130458831787, "logps/chosen": -7.310843467712402, "logps/rejected": -7.921121120452881, "loss": 0.5644, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -7.310843467712402, "rewards/margins": 0.6102767586708069, "rewards/rejected": -7.921121120452881, "semantic_entropy": 0.011266985908150673, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 17.308136534374302, "learning_rate": 9.861285504315084e-07, "logits/chosen": 0.2767130434513092, "logits/rejected": 0.3372814357280731, "logps/chosen": -7.225625038146973, "logps/rejected": -7.890871524810791, "loss": 0.5472, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -7.225625038146973, "rewards/margins": 0.6652467846870422, "rewards/rejected": -7.890871524810791, "semantic_entropy": 0.011087710037827492, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 18.83655138445813, "learning_rate": 9.857618932900502e-07, "logits/chosen": 0.25278183817863464, "logits/rejected": 0.37249675393104553, "logps/chosen": -7.409969329833984, "logps/rejected": -7.930933475494385, "loss": 0.6172, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -7.409969329833984, "rewards/margins": 0.5209641456604004, "rewards/rejected": -7.930933475494385, "semantic_entropy": 0.012186022475361824, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 22.919603952974256, "learning_rate": 9.853905232845727e-07, "logits/chosen": 0.22130601108074188, "logits/rejected": 0.3324377238750458, "logps/chosen": -7.2493391036987305, "logps/rejected": -7.837095737457275, "loss": 0.614, "rewards/accuracies": 0.71875, "rewards/chosen": -7.2493391036987305, "rewards/margins": 0.5877568125724792, "rewards/rejected": -7.837095737457275, "semantic_entropy": 0.013909459114074707, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 20.299888419343265, "learning_rate": 9.850144440181095e-07, "logits/chosen": 0.2346227467060089, "logits/rejected": 0.38050609827041626, "logps/chosen": -7.475827217102051, "logps/rejected": -8.115577697753906, "loss": 0.5694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7.475827217102051, "rewards/margins": 0.639750599861145, "rewards/rejected": -8.115577697753906, "semantic_entropy": 0.011748342774808407, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 26.80684693873056, "learning_rate": 9.846336591393832e-07, "logits/chosen": 0.26571953296661377, "logits/rejected": 0.3704363703727722, "logps/chosen": -7.530020713806152, "logps/rejected": -8.231939315795898, "loss": 0.5966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7.530020713806152, "rewards/margins": 0.7019174098968506, "rewards/rejected": -8.231939315795898, "semantic_entropy": 0.010797923430800438, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 22.781706412422338, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.3323562741279602, "logits/rejected": 0.3556436598300934, "logps/chosen": -7.796743869781494, "logps/rejected": -8.327234268188477, "loss": 0.6751, "rewards/accuracies": 0.65625, "rewards/chosen": -7.796743869781494, "rewards/margins": 0.5304909944534302, "rewards/rejected": -8.327234268188477, "semantic_entropy": 0.0087089529260993, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 15.79393005727687, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.37312960624694824, "logits/rejected": 0.370185911655426, "logps/chosen": -7.5613603591918945, "logps/rejected": -8.011152267456055, "loss": 0.6467, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -7.5613603591918945, "rewards/margins": 0.4497918486595154, "rewards/rejected": -8.011152267456055, "semantic_entropy": 0.008952843025326729, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 11.274721147056358, "learning_rate": 9.834631080014457e-07, "logits/chosen": 0.34255561232566833, "logits/rejected": 0.4821571409702301, "logps/chosen": -7.382586479187012, "logps/rejected": -7.989335060119629, "loss": 0.5608, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -7.382586479187012, "rewards/margins": 0.6067487001419067, "rewards/rejected": -7.989335060119629, "semantic_entropy": 0.00937967374920845, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 18.0683203101528, "learning_rate": 9.830635380734312e-07, "logits/chosen": 0.3637096583843231, "logits/rejected": 0.4652346074581146, "logps/chosen": -7.228192329406738, "logps/rejected": -7.7642951011657715, "loss": 0.5798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7.228192329406738, "rewards/margins": 0.536103367805481, "rewards/rejected": -7.7642951011657715, "semantic_entropy": 0.010401034727692604, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 15.686670446006769, "learning_rate": 9.826592814608517e-07, "logits/chosen": 0.508113443851471, "logits/rejected": 0.6232683062553406, "logps/chosen": -7.083076477050781, "logps/rejected": -7.62778377532959, "loss": 0.5897, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -7.083076477050781, "rewards/margins": 0.5447085499763489, "rewards/rejected": -7.62778377532959, "semantic_entropy": 0.010786814615130424, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 11.8792829377083, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.5830814242362976, "logits/rejected": 0.602809488773346, "logps/chosen": -7.003039360046387, "logps/rejected": -7.556340217590332, "loss": 0.5887, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -7.003039360046387, "rewards/margins": 0.5533004999160767, "rewards/rejected": -7.556340217590332, "semantic_entropy": 0.01125816348940134, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 13.602049737746205, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.5771272778511047, "logits/rejected": 0.6160858869552612, "logps/chosen": -7.035143852233887, "logps/rejected": -7.554785251617432, "loss": 0.6097, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -7.035143852233887, "rewards/margins": 0.5196409225463867, "rewards/rejected": -7.554785251617432, "semantic_entropy": 0.010585736483335495, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 13.516147889237553, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.594735860824585, "logits/rejected": 0.6816811561584473, "logps/chosen": -7.359915256500244, "logps/rejected": -7.887757778167725, "loss": 0.5661, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -7.359915256500244, "rewards/margins": 0.52784264087677, "rewards/rejected": -7.887757778167725, "semantic_entropy": 0.00826399214565754, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 21.398826323428153, "learning_rate": 9.809954672881237e-07, "logits/chosen": 0.5744519829750061, "logits/rejected": 0.676822304725647, "logps/chosen": -7.203065395355225, "logps/rejected": -7.726864814758301, "loss": 0.5962, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -7.203065395355225, "rewards/margins": 0.5237992405891418, "rewards/rejected": -7.726864814758301, "semantic_entropy": 0.01075592078268528, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 13.803390343561437, "learning_rate": 9.80567836992274e-07, "logits/chosen": 0.5930423140525818, "logits/rejected": 0.6964749693870544, "logps/chosen": -6.960592746734619, "logps/rejected": -7.653602600097656, "loss": 0.5627, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -6.960592746734619, "rewards/margins": 0.6930093765258789, "rewards/rejected": -7.653602600097656, "semantic_entropy": 0.012550493702292442, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 17.08463390545894, "learning_rate": 9.801355442251625e-07, "logits/chosen": 0.5665202140808105, "logits/rejected": 0.6539164781570435, "logps/chosen": -6.99094295501709, "logps/rejected": -7.582823276519775, "loss": 0.5839, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -6.99094295501709, "rewards/margins": 0.5918795466423035, "rewards/rejected": -7.582823276519775, "semantic_entropy": 0.011810271069407463, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 16.08279102674936, "learning_rate": 9.796985931808949e-07, "logits/chosen": 0.5616046786308289, "logits/rejected": 0.6401379704475403, "logps/chosen": -6.898770809173584, "logps/rejected": -7.535937309265137, "loss": 0.5637, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.898770809173584, "rewards/margins": 0.6371673941612244, "rewards/rejected": -7.535937309265137, "semantic_entropy": 0.012174823321402073, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 17.72669785551067, "learning_rate": 9.792569880987724e-07, "logits/chosen": 0.5178264379501343, "logits/rejected": 0.5918501019477844, "logps/chosen": -7.055424690246582, "logps/rejected": -7.835641384124756, "loss": 0.5346, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -7.055424690246582, "rewards/margins": 0.7802165150642395, "rewards/rejected": -7.835641384124756, "semantic_entropy": 0.012361900880932808, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 23.667023672408078, "learning_rate": 9.788107332632493e-07, "logits/chosen": 0.5643856525421143, "logits/rejected": 0.615861713886261, "logps/chosen": -7.197871208190918, "logps/rejected": -7.710868835449219, "loss": 0.6595, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -7.197871208190918, "rewards/margins": 0.512997031211853, "rewards/rejected": -7.710868835449219, "semantic_entropy": 0.010254869237542152, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 18.12568347350743, "learning_rate": 9.783598330038924e-07, "logits/chosen": 0.607509434223175, "logits/rejected": 0.6717751622200012, "logps/chosen": -7.640361785888672, "logps/rejected": -8.166301727294922, "loss": 0.5919, "rewards/accuracies": 0.6875, "rewards/chosen": -7.640361785888672, "rewards/margins": 0.525938868522644, "rewards/rejected": -8.166301727294922, "semantic_entropy": 0.0061937421560287476, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 16.521524018519518, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.6564599871635437, "logits/rejected": 0.7573956251144409, "logps/chosen": -7.6685791015625, "logps/rejected": -8.591699600219727, "loss": 0.4835, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.6685791015625, "rewards/margins": 0.923120379447937, "rewards/rejected": -8.591699600219727, "semantic_entropy": 0.006848378572613001, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 23.54333369460797, "learning_rate": 9.774441137572487e-07, "logits/chosen": 0.6087485551834106, "logits/rejected": 0.704781711101532, "logps/chosen": -8.048932075500488, "logps/rejected": -8.719579696655273, "loss": 0.5672, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.048932075500488, "rewards/margins": 0.6706476211547852, "rewards/rejected": -8.719579696655273, "semantic_entropy": 0.005030449479818344, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 18.562047799738053, "learning_rate": 9.76979303654274e-07, "logits/chosen": 0.531555712223053, "logits/rejected": 0.5957599878311157, "logps/chosen": -8.234556198120117, "logps/rejected": -8.946748733520508, "loss": 0.5617, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.234556198120117, "rewards/margins": 0.7121928930282593, "rewards/rejected": -8.946748733520508, "semantic_entropy": 0.003957569133490324, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 20.633835966884693, "learning_rate": 9.765098658960035e-07, "logits/chosen": 0.5291253924369812, "logits/rejected": 0.5597686767578125, "logps/chosen": -8.142400741577148, "logps/rejected": -8.771413803100586, "loss": 0.5741, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.142400741577148, "rewards/margins": 0.6290136575698853, "rewards/rejected": -8.771413803100586, "semantic_entropy": 0.004258748609572649, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 34.54645017218859, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.5386477112770081, "logits/rejected": 0.6480933427810669, "logps/chosen": -8.24083137512207, "logps/rejected": -8.844766616821289, "loss": 0.5895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.24083137512207, "rewards/margins": 0.6039354801177979, "rewards/rejected": -8.844766616821289, "semantic_entropy": 0.004295586608350277, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 18.991335444449422, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.5871809124946594, "logits/rejected": 0.6752435564994812, "logps/chosen": -8.063031196594238, "logps/rejected": -8.696462631225586, "loss": 0.5889, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -8.063031196594238, "rewards/margins": 0.6334304809570312, "rewards/rejected": -8.696462631225586, "semantic_entropy": 0.005001295357942581, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 13.986507130299717, "learning_rate": 9.750738324585097e-07, "logits/chosen": 0.5075832605361938, "logits/rejected": 0.65854811668396, "logps/chosen": -8.004261016845703, "logps/rejected": -8.6107177734375, "loss": 0.5822, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.004261016845703, "rewards/margins": 0.6064566373825073, "rewards/rejected": -8.6107177734375, "semantic_entropy": 0.004056010395288467, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 13.26346189390983, "learning_rate": 9.74585930072237e-07, "logits/chosen": 0.5564194917678833, "logits/rejected": 0.6411615014076233, "logps/chosen": -7.78427791595459, "logps/rejected": -8.535164833068848, "loss": 0.5486, "rewards/accuracies": 0.75, "rewards/chosen": -7.78427791595459, "rewards/margins": 0.7508861422538757, "rewards/rejected": -8.535164833068848, "semantic_entropy": 0.00624846201390028, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 17.424932509279063, "learning_rate": 9.740934232511892e-07, "logits/chosen": 0.5387696623802185, "logits/rejected": 0.6152251362800598, "logps/chosen": -7.787422180175781, "logps/rejected": -8.456579208374023, "loss": 0.5961, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -7.787422180175781, "rewards/margins": 0.6691574454307556, "rewards/rejected": -8.456579208374023, "semantic_entropy": 0.005864334292709827, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 13.701215181738062, "learning_rate": 9.735963167736698e-07, "logits/chosen": 0.6226884126663208, "logits/rejected": 0.7190333008766174, "logps/chosen": -7.892449855804443, "logps/rejected": -8.544393539428711, "loss": 0.5802, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -7.892449855804443, "rewards/margins": 0.6519426107406616, "rewards/rejected": -8.544393539428711, "semantic_entropy": 0.005305818282067776, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 24.146957684848896, "learning_rate": 9.730946154626078e-07, "logits/chosen": 0.6268946528434753, "logits/rejected": 0.6841186285018921, "logps/chosen": -7.797842502593994, "logps/rejected": -8.302523612976074, "loss": 0.6651, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -7.797842502593994, "rewards/margins": 0.5046811699867249, "rewards/rejected": -8.302523612976074, "semantic_entropy": 0.006395612843334675, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 18.196449287747534, "learning_rate": 9.725883241855117e-07, "logits/chosen": 0.5718734264373779, "logits/rejected": 0.6677632331848145, "logps/chosen": -7.862264156341553, "logps/rejected": -8.452044486999512, "loss": 0.5838, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -7.862264156341553, "rewards/margins": 0.5897812843322754, "rewards/rejected": -8.452044486999512, "semantic_entropy": 0.005370546132326126, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 18.197375532898803, "learning_rate": 9.720774478544218e-07, "logits/chosen": 0.6339142322540283, "logits/rejected": 0.7179350852966309, "logps/chosen": -7.516765594482422, "logps/rejected": -8.200715065002441, "loss": 0.5646, "rewards/accuracies": 0.6875, "rewards/chosen": -7.516765594482422, "rewards/margins": 0.6839491128921509, "rewards/rejected": -8.200715065002441, "semantic_entropy": 0.007220913656055927, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 16.846636546681744, "learning_rate": 9.715619914258624e-07, "logits/chosen": 0.570271909236908, "logits/rejected": 0.6238844990730286, "logps/chosen": -7.6180243492126465, "logps/rejected": -8.167299270629883, "loss": 0.5952, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -7.6180243492126465, "rewards/margins": 0.549274206161499, "rewards/rejected": -8.167299270629883, "semantic_entropy": 0.006116692908108234, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 26.633505176750862, "learning_rate": 9.710419599007937e-07, "logits/chosen": 0.638900101184845, "logits/rejected": 0.7319290637969971, "logps/chosen": -7.582394599914551, "logps/rejected": -8.110502243041992, "loss": 0.5966, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -7.582394599914551, "rewards/margins": 0.5281090140342712, "rewards/rejected": -8.110502243041992, "semantic_entropy": 0.006717337761074305, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 27.411766077174338, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.6096881628036499, "logits/rejected": 0.7123531699180603, "logps/chosen": -7.56555700302124, "logps/rejected": -7.999688625335693, "loss": 0.6675, "rewards/accuracies": 0.65625, "rewards/chosen": -7.56555700302124, "rewards/margins": 0.4341324269771576, "rewards/rejected": -7.999688625335693, "semantic_entropy": 0.0064805252477526665, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 13.615665322183913, "learning_rate": 9.699881917868609e-07, "logits/chosen": 0.5667654275894165, "logits/rejected": 0.6309981346130371, "logps/chosen": -7.366901397705078, "logps/rejected": -7.9114861488342285, "loss": 0.6056, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.366901397705078, "rewards/margins": 0.5445848703384399, "rewards/rejected": -7.9114861488342285, "semantic_entropy": 0.007139952387660742, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 15.675473698672269, "learning_rate": 9.694544654216594e-07, "logits/chosen": 0.5247001647949219, "logits/rejected": 0.6292780041694641, "logps/chosen": -7.417148590087891, "logps/rejected": -7.97311544418335, "loss": 0.5796, "rewards/accuracies": 0.71875, "rewards/chosen": -7.417148590087891, "rewards/margins": 0.5559675097465515, "rewards/rejected": -7.97311544418335, "semantic_entropy": 0.006966522429138422, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 16.21958228068165, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.6028557419776917, "logits/rejected": 0.6466434597969055, "logps/chosen": -7.145249366760254, "logps/rejected": -7.668765068054199, "loss": 0.5902, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -7.145249366760254, "rewards/margins": 0.5235155820846558, "rewards/rejected": -7.668765068054199, "semantic_entropy": 0.008800549432635307, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 15.585684935656099, "learning_rate": 9.683733539658138e-07, "logits/chosen": 0.6030339002609253, "logits/rejected": 0.7146845459938049, "logps/chosen": -7.388113975524902, "logps/rejected": -7.941763401031494, "loss": 0.5785, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -7.388113975524902, "rewards/margins": 0.5536485910415649, "rewards/rejected": -7.941763401031494, "semantic_entropy": 0.007055189460515976, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 15.602057313200888, "learning_rate": 9.678259793641178e-07, "logits/chosen": 0.5913205742835999, "logits/rejected": 0.602319598197937, "logps/chosen": -7.421736717224121, "logps/rejected": -7.810797214508057, "loss": 0.6116, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -7.421736717224121, "rewards/margins": 0.38906151056289673, "rewards/rejected": -7.810797214508057, "semantic_entropy": 0.00688566267490387, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 15.626443610629703, "learning_rate": 9.672740659127183e-07, "logits/chosen": 0.5252998471260071, "logits/rejected": 0.5994977355003357, "logps/chosen": -7.5483551025390625, "logps/rejected": -8.214573860168457, "loss": 0.5531, "rewards/accuracies": 0.71875, "rewards/chosen": -7.5483551025390625, "rewards/margins": 0.6662176251411438, "rewards/rejected": -8.214573860168457, "semantic_entropy": 0.007115071173757315, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 16.005271466438277, "learning_rate": 9.667176189662818e-07, "logits/chosen": 0.5813131332397461, "logits/rejected": 0.640332818031311, "logps/chosen": -7.900903224945068, "logps/rejected": -8.512435913085938, "loss": 0.5717, "rewards/accuracies": 0.71875, "rewards/chosen": -7.900903224945068, "rewards/margins": 0.6115323901176453, "rewards/rejected": -8.512435913085938, "semantic_entropy": 0.0051747518591582775, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 10.625567401564435, "learning_rate": 9.661566439234592e-07, "logits/chosen": 0.6257847547531128, "logits/rejected": 0.6622999906539917, "logps/chosen": -7.946097373962402, "logps/rejected": -8.47764778137207, "loss": 0.6041, "rewards/accuracies": 0.6875, "rewards/chosen": -7.946097373962402, "rewards/margins": 0.5315494537353516, "rewards/rejected": -8.47764778137207, "semantic_entropy": 0.005170217715203762, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 13.397344602674059, "learning_rate": 9.655911462268327e-07, "logits/chosen": 0.6595005989074707, "logits/rejected": 0.7116304636001587, "logps/chosen": -7.946617126464844, "logps/rejected": -8.630485534667969, "loss": 0.5581, "rewards/accuracies": 0.6875, "rewards/chosen": -7.946617126464844, "rewards/margins": 0.6838675737380981, "rewards/rejected": -8.630485534667969, "semantic_entropy": 0.005823346786201, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 15.865578560547265, "learning_rate": 9.650211313628636e-07, "logits/chosen": 0.5803102254867554, "logits/rejected": 0.6273818016052246, "logps/chosen": -7.978617191314697, "logps/rejected": -8.42393684387207, "loss": 0.6414, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -7.978617191314697, "rewards/margins": 0.44531959295272827, "rewards/rejected": -8.42393684387207, "semantic_entropy": 0.0065127527341246605, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 15.673714953896777, "learning_rate": 9.644466048618386e-07, "logits/chosen": 0.5762825608253479, "logits/rejected": 0.6548576354980469, "logps/chosen": -8.140003204345703, "logps/rejected": -8.694659233093262, "loss": 0.5987, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.140003204345703, "rewards/margins": 0.5546567440032959, "rewards/rejected": -8.694659233093262, "semantic_entropy": 0.005027764476835728, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 14.354746897825013, "learning_rate": 9.63867572297816e-07, "logits/chosen": 0.5808348655700684, "logits/rejected": 0.6876333355903625, "logps/chosen": -7.959421634674072, "logps/rejected": -8.607701301574707, "loss": 0.5651, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.959421634674072, "rewards/margins": 0.6482798457145691, "rewards/rejected": -8.607701301574707, "semantic_entropy": 0.006594679318368435, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 12.98455385650067, "learning_rate": 9.632840392885727e-07, "logits/chosen": 0.5893815755844116, "logits/rejected": 0.661659836769104, "logps/chosen": -7.993622779846191, "logps/rejected": -8.655765533447266, "loss": 0.5828, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -7.993622779846191, "rewards/margins": 0.6621420979499817, "rewards/rejected": -8.655765533447266, "semantic_entropy": 0.00508379889652133, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 14.115881656600934, "learning_rate": 9.626960114955483e-07, "logits/chosen": 0.6585602164268494, "logits/rejected": 0.7358173131942749, "logps/chosen": -7.811059474945068, "logps/rejected": -8.613174438476562, "loss": 0.5195, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -7.811059474945068, "rewards/margins": 0.8021153211593628, "rewards/rejected": -8.613174438476562, "semantic_entropy": 0.006301518529653549, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 16.283892207062348, "learning_rate": 9.621034946237909e-07, "logits/chosen": 0.6387815475463867, "logits/rejected": 0.7039491534233093, "logps/chosen": -7.9904046058654785, "logps/rejected": -8.646058082580566, "loss": 0.5645, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -7.9904046058654785, "rewards/margins": 0.655653715133667, "rewards/rejected": -8.646058082580566, "semantic_entropy": 0.005252276547253132, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 15.655836290539325, "learning_rate": 9.615064944219021e-07, "logits/chosen": 0.6491774320602417, "logits/rejected": 0.7328025698661804, "logps/chosen": -7.836843013763428, "logps/rejected": -8.513358116149902, "loss": 0.5399, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.836843013763428, "rewards/margins": 0.6765137910842896, "rewards/rejected": -8.513358116149902, "semantic_entropy": 0.005473036784678698, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 22.62708626022183, "learning_rate": 9.609050166819803e-07, "logits/chosen": 0.5962838530540466, "logits/rejected": 0.6391795873641968, "logps/chosen": -8.035821914672852, "logps/rejected": -8.608399391174316, "loss": 0.5951, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.035821914672852, "rewards/margins": 0.572577953338623, "rewards/rejected": -8.608399391174316, "semantic_entropy": 0.005160279106348753, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.7506793141365051, "eval_logits/rejected": 0.7968686819076538, "eval_logps/chosen": -7.988265514373779, "eval_logps/rejected": -8.681319236755371, "eval_loss": 0.5522213578224182, "eval_rewards/accuracies": 0.7062314748764038, "eval_rewards/chosen": -7.988265514373779, "eval_rewards/margins": 0.6930533647537231, "eval_rewards/rejected": -8.681319236755371, "eval_runtime": 35.081, "eval_samples_per_second": 38.34, "eval_semantic_entropy": 0.004989789333194494, "eval_steps_per_second": 9.606, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 18.377278845630318, "learning_rate": 9.602990672395653e-07, "logits/chosen": 0.582940399646759, "logits/rejected": 0.6627975106239319, "logps/chosen": -8.004450798034668, "logps/rejected": -8.656949043273926, "loss": 0.5505, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.004450798034668, "rewards/margins": 0.6524981260299683, "rewards/rejected": -8.656949043273926, "semantic_entropy": 0.004912947304546833, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 13.700239190708754, "learning_rate": 9.59688651973581e-07, "logits/chosen": 0.7021932601928711, "logits/rejected": 0.7906457185745239, "logps/chosen": -8.091392517089844, "logps/rejected": -8.654991149902344, "loss": 0.587, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.091392517089844, "rewards/margins": 0.563599169254303, "rewards/rejected": -8.654991149902344, "semantic_entropy": 0.004794766195118427, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 15.628975304693077, "learning_rate": 9.590737768062792e-07, "logits/chosen": 0.6097584962844849, "logits/rejected": 0.6667622327804565, "logps/chosen": -8.029305458068848, "logps/rejected": -8.514669418334961, "loss": 0.619, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.029305458068848, "rewards/margins": 0.4853641390800476, "rewards/rejected": -8.514669418334961, "semantic_entropy": 0.004363791085779667, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 14.141900238974408, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.7649446725845337, "logits/rejected": 0.8241230249404907, "logps/chosen": -7.659104824066162, "logps/rejected": -8.234978675842285, "loss": 0.5818, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -7.659104824066162, "rewards/margins": 0.5758742094039917, "rewards/rejected": -8.234978675842285, "semantic_entropy": 0.006544353906065226, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 17.44536516233375, "learning_rate": 9.578306706730215e-07, "logits/chosen": 0.6202625036239624, "logits/rejected": 0.7067128419876099, "logps/chosen": -7.734452724456787, "logps/rejected": -8.231317520141602, "loss": 0.6291, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -7.734452724456787, "rewards/margins": 0.4968656599521637, "rewards/rejected": -8.231317520141602, "semantic_entropy": 0.006045544985681772, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 14.058747132134709, "learning_rate": 9.572024517676865e-07, "logits/chosen": 0.6863638162612915, "logits/rejected": 0.7385177612304688, "logps/chosen": -7.626795768737793, "logps/rejected": -8.15473461151123, "loss": 0.6068, "rewards/accuracies": 0.65625, "rewards/chosen": -7.626795768737793, "rewards/margins": 0.5279384851455688, "rewards/rejected": -8.15473461151123, "semantic_entropy": 0.006082098465412855, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 15.524786458902534, "learning_rate": 9.565697970821593e-07, "logits/chosen": 0.6960703134536743, "logits/rejected": 0.7752768397331238, "logps/chosen": -7.594348907470703, "logps/rejected": -8.13396167755127, "loss": 0.5959, "rewards/accuracies": 0.65625, "rewards/chosen": -7.594348907470703, "rewards/margins": 0.5396129488945007, "rewards/rejected": -8.13396167755127, "semantic_entropy": 0.0065464479848742485, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 10.898687489202283, "learning_rate": 9.559327127544585e-07, "logits/chosen": 0.6455325484275818, "logits/rejected": 0.7051125764846802, "logps/chosen": -7.510709285736084, "logps/rejected": -8.048765182495117, "loss": 0.5766, "rewards/accuracies": 0.65625, "rewards/chosen": -7.510709285736084, "rewards/margins": 0.5380562543869019, "rewards/rejected": -8.048765182495117, "semantic_entropy": 0.007156215608119965, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 18.786409391603485, "learning_rate": 9.552912049655789e-07, "logits/chosen": 0.6517975330352783, "logits/rejected": 0.7252013683319092, "logps/chosen": -7.325045108795166, "logps/rejected": -7.9806952476501465, "loss": 0.5695, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -7.325045108795166, "rewards/margins": 0.6556496620178223, "rewards/rejected": -7.9806952476501465, "semantic_entropy": 0.00768858939409256, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 24.445945587094794, "learning_rate": 9.546452799394315e-07, "logits/chosen": 0.6680857539176941, "logits/rejected": 0.7646031975746155, "logps/chosen": -7.549722194671631, "logps/rejected": -8.019618034362793, "loss": 0.645, "rewards/accuracies": 0.625, "rewards/chosen": -7.549722194671631, "rewards/margins": 0.46989649534225464, "rewards/rejected": -8.019618034362793, "semantic_entropy": 0.006873616483062506, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 15.014444640765525, "learning_rate": 9.539949439427846e-07, "logits/chosen": 0.6218008995056152, "logits/rejected": 0.6838528513908386, "logps/chosen": -7.445742607116699, "logps/rejected": -8.093426704406738, "loss": 0.5457, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -7.445742607116699, "rewards/margins": 0.6476832628250122, "rewards/rejected": -8.093426704406738, "semantic_entropy": 0.007571948226541281, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 12.345959930188625, "learning_rate": 9.533402032852002e-07, "logits/chosen": 0.5849351286888123, "logits/rejected": 0.6522541642189026, "logps/chosen": -7.5591864585876465, "logps/rejected": -8.265511512756348, "loss": 0.5424, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -7.5591864585876465, "rewards/margins": 0.7063250541687012, "rewards/rejected": -8.265511512756348, "semantic_entropy": 0.006167138926684856, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 16.266967829326354, "learning_rate": 9.526810643189754e-07, "logits/chosen": 0.6240657567977905, "logits/rejected": 0.7110647559165955, "logps/chosen": -7.6305341720581055, "logps/rejected": -8.264394760131836, "loss": 0.5468, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -7.6305341720581055, "rewards/margins": 0.6338610053062439, "rewards/rejected": -8.264394760131836, "semantic_entropy": 0.006204391364008188, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 16.69670345070714, "learning_rate": 9.52017533439079e-07, "logits/chosen": 0.5465856790542603, "logits/rejected": 0.5944739580154419, "logps/chosen": -7.692608833312988, "logps/rejected": -8.17878532409668, "loss": 0.6284, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -7.692608833312988, "rewards/margins": 0.48617634177207947, "rewards/rejected": -8.17878532409668, "semantic_entropy": 0.0065203020349144936, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 10.756532347044482, "learning_rate": 9.513496170830909e-07, "logits/chosen": 0.5842273235321045, "logits/rejected": 0.6651209592819214, "logps/chosen": -7.88360595703125, "logps/rejected": -8.369672775268555, "loss": 0.6449, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -7.88360595703125, "rewards/margins": 0.4860672950744629, "rewards/rejected": -8.369672775268555, "semantic_entropy": 0.005015389062464237, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 21.16131993716241, "learning_rate": 9.506773217311382e-07, "logits/chosen": 0.6626430153846741, "logits/rejected": 0.7511327862739563, "logps/chosen": -7.711289882659912, "logps/rejected": -8.375436782836914, "loss": 0.5492, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -7.711289882659912, "rewards/margins": 0.6641460657119751, "rewards/rejected": -8.375436782836914, "semantic_entropy": 0.006019088439643383, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 17.008433131854304, "learning_rate": 9.500006539058334e-07, "logits/chosen": 0.73247891664505, "logits/rejected": 0.7920357584953308, "logps/chosen": -7.982652187347412, "logps/rejected": -8.343966484069824, "loss": 0.6391, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -7.982652187347412, "rewards/margins": 0.3613142967224121, "rewards/rejected": -8.343966484069824, "semantic_entropy": 0.004631609655916691, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 13.160928071624928, "learning_rate": 9.493196201722109e-07, "logits/chosen": 0.6529003977775574, "logits/rejected": 0.7320042252540588, "logps/chosen": -7.793301582336426, "logps/rejected": -8.294574737548828, "loss": 0.6074, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -7.793301582336426, "rewards/margins": 0.5012733340263367, "rewards/rejected": -8.294574737548828, "semantic_entropy": 0.0051393527537584305, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 14.183716262535388, "learning_rate": 9.486342271376628e-07, "logits/chosen": 0.6803663969039917, "logits/rejected": 0.6954035758972168, "logps/chosen": -7.670355796813965, "logps/rejected": -8.38364028930664, "loss": 0.5344, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.670355796813965, "rewards/margins": 0.7132849097251892, "rewards/rejected": -8.38364028930664, "semantic_entropy": 0.006493359804153442, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 14.518556999601335, "learning_rate": 9.479444814518755e-07, "logits/chosen": 0.7013619542121887, "logits/rejected": 0.8164475560188293, "logps/chosen": -7.910555839538574, "logps/rejected": -8.657155990600586, "loss": 0.5453, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.910555839538574, "rewards/margins": 0.746599555015564, "rewards/rejected": -8.657155990600586, "semantic_entropy": 0.004653572104871273, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 12.861320996249733, "learning_rate": 9.472503898067645e-07, "logits/chosen": 0.7577477693557739, "logits/rejected": 0.7888853549957275, "logps/chosen": -7.878431797027588, "logps/rejected": -8.5064058303833, "loss": 0.5883, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -7.878431797027588, "rewards/margins": 0.6279749870300293, "rewards/rejected": -8.5064058303833, "semantic_entropy": 0.004988783039152622, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 14.945074518060963, "learning_rate": 9.465519589364099e-07, "logits/chosen": 0.8046930432319641, "logits/rejected": 0.8452129364013672, "logps/chosen": -7.966567039489746, "logps/rejected": -8.664255142211914, "loss": 0.5506, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -7.966567039489746, "rewards/margins": 0.6976876258850098, "rewards/rejected": -8.664255142211914, "semantic_entropy": 0.004865294322371483, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 18.100914575781236, "learning_rate": 9.458491956169914e-07, "logits/chosen": 0.8275071382522583, "logits/rejected": 0.8813290596008301, "logps/chosen": -8.298616409301758, "logps/rejected": -8.859460830688477, "loss": 0.6166, "rewards/accuracies": 0.6875, "rewards/chosen": -8.298616409301758, "rewards/margins": 0.5608429312705994, "rewards/rejected": -8.859460830688477, "semantic_entropy": 0.003682538866996765, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 15.788405153142321, "learning_rate": 9.451421066667215e-07, "logits/chosen": 0.7420376539230347, "logits/rejected": 0.8037020564079285, "logps/chosen": -8.253267288208008, "logps/rejected": -8.862098693847656, "loss": 0.5702, "rewards/accuracies": 0.71875, "rewards/chosen": -8.253267288208008, "rewards/margins": 0.6088317036628723, "rewards/rejected": -8.862098693847656, "semantic_entropy": 0.003497874829918146, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 20.04954090545044, "learning_rate": 9.444306989457805e-07, "logits/chosen": 0.8426326513290405, "logits/rejected": 0.8867173194885254, "logps/chosen": -8.014307022094727, "logps/rejected": -8.588244438171387, "loss": 0.6257, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -8.014307022094727, "rewards/margins": 0.5739374756813049, "rewards/rejected": -8.588244438171387, "semantic_entropy": 0.0046114143915474415, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 16.08088418888514, "learning_rate": 9.437149793562489e-07, "logits/chosen": 0.8074777722358704, "logits/rejected": 0.8401328921318054, "logps/chosen": -7.99387264251709, "logps/rejected": -8.621038436889648, "loss": 0.5737, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -7.99387264251709, "rewards/margins": 0.6271660327911377, "rewards/rejected": -8.621038436889648, "semantic_entropy": 0.004982014186680317, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 17.648574778030703, "learning_rate": 9.429949548420417e-07, "logits/chosen": 0.7622288465499878, "logits/rejected": 0.7995889782905579, "logps/chosen": -8.03836441040039, "logps/rejected": -8.615036010742188, "loss": 0.5845, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.03836441040039, "rewards/margins": 0.5766717195510864, "rewards/rejected": -8.615036010742188, "semantic_entropy": 0.005048284772783518, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 13.251467288565097, "learning_rate": 9.422706323888396e-07, "logits/chosen": 0.7949849963188171, "logits/rejected": 0.835627555847168, "logps/chosen": -8.13330078125, "logps/rejected": -8.713802337646484, "loss": 0.5896, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -8.13330078125, "rewards/margins": 0.580501139163971, "rewards/rejected": -8.713802337646484, "semantic_entropy": 0.00421832874417305, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 12.727013846864125, "learning_rate": 9.415420190240225e-07, "logits/chosen": 0.8075268864631653, "logits/rejected": 0.8839607238769531, "logps/chosen": -8.259916305541992, "logps/rejected": -9.057500839233398, "loss": 0.4887, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.259916305541992, "rewards/margins": 0.7975843548774719, "rewards/rejected": -9.057500839233398, "semantic_entropy": 0.003385394811630249, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 19.33189686664916, "learning_rate": 9.408091218166002e-07, "logits/chosen": 0.7991722226142883, "logits/rejected": 0.829816997051239, "logps/chosen": -8.155218124389648, "logps/rejected": -8.655525207519531, "loss": 0.6021, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.155218124389648, "rewards/margins": 0.5003066062927246, "rewards/rejected": -8.655525207519531, "semantic_entropy": 0.004503914155066013, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 21.351677483542304, "learning_rate": 9.400719478771449e-07, "logits/chosen": 0.7611302137374878, "logits/rejected": 0.8729363679885864, "logps/chosen": -8.371480941772461, "logps/rejected": -8.982285499572754, "loss": 0.5771, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.371480941772461, "rewards/margins": 0.6108050346374512, "rewards/rejected": -8.982285499572754, "semantic_entropy": 0.004243707284331322, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 15.476926041346282, "learning_rate": 9.393305043577209e-07, "logits/chosen": 0.7315706610679626, "logits/rejected": 0.7855316400527954, "logps/chosen": -8.193710327148438, "logps/rejected": -9.018750190734863, "loss": 0.5218, "rewards/accuracies": 0.71875, "rewards/chosen": -8.193710327148438, "rewards/margins": 0.8250393867492676, "rewards/rejected": -9.018750190734863, "semantic_entropy": 0.004040227737277746, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 11.327037956728795, "learning_rate": 9.38584798451817e-07, "logits/chosen": 0.6972507238388062, "logits/rejected": 0.7701762318611145, "logps/chosen": -8.05665397644043, "logps/rejected": -8.655900955200195, "loss": 0.5916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.05665397644043, "rewards/margins": 0.5992475748062134, "rewards/rejected": -8.655900955200195, "semantic_entropy": 0.0046203965321183205, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 26.290588474950017, "learning_rate": 9.37834837394275e-07, "logits/chosen": 0.6830715537071228, "logits/rejected": 0.7537237405776978, "logps/chosen": -8.189208030700684, "logps/rejected": -9.016082763671875, "loss": 0.5602, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.189208030700684, "rewards/margins": 0.8268746137619019, "rewards/rejected": -9.016082763671875, "semantic_entropy": 0.004340589977800846, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 14.319339627993806, "learning_rate": 9.370806284612203e-07, "logits/chosen": 0.6698434352874756, "logits/rejected": 0.7308276295661926, "logps/chosen": -8.377470016479492, "logps/rejected": -9.087924003601074, "loss": 0.53, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.377470016479492, "rewards/margins": 0.7104541063308716, "rewards/rejected": -9.087924003601074, "semantic_entropy": 0.0037077039014548063, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 14.5676067902578, "learning_rate": 9.363221789699912e-07, "logits/chosen": 0.6635018587112427, "logits/rejected": 0.7242141962051392, "logps/chosen": -8.505064010620117, "logps/rejected": -9.010017395019531, "loss": 0.6233, "rewards/accuracies": 0.65625, "rewards/chosen": -8.505064010620117, "rewards/margins": 0.5049545168876648, "rewards/rejected": -9.010017395019531, "semantic_entropy": 0.003181255189701915, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 16.866071551135633, "learning_rate": 9.355594962790682e-07, "logits/chosen": 0.6800563335418701, "logits/rejected": 0.7338107228279114, "logps/chosen": -8.420328140258789, "logps/rejected": -9.087446212768555, "loss": 0.5497, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.420328140258789, "rewards/margins": 0.6671197414398193, "rewards/rejected": -9.087446212768555, "semantic_entropy": 0.0032719075679779053, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 15.614567180768711, "learning_rate": 9.34792587788002e-07, "logits/chosen": 0.737398624420166, "logits/rejected": 0.7930010557174683, "logps/chosen": -8.552255630493164, "logps/rejected": -9.13463020324707, "loss": 0.5937, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.552255630493164, "rewards/margins": 0.5823749899864197, "rewards/rejected": -9.13463020324707, "semantic_entropy": 0.002839865395799279, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 17.35671690242798, "learning_rate": 9.34021460937342e-07, "logits/chosen": 0.7323909997940063, "logits/rejected": 0.7694789171218872, "logps/chosen": -8.701313972473145, "logps/rejected": -9.22875690460205, "loss": 0.5945, "rewards/accuracies": 0.6875, "rewards/chosen": -8.701313972473145, "rewards/margins": 0.5274431109428406, "rewards/rejected": -9.22875690460205, "semantic_entropy": 0.002377058146521449, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 10.711809648309112, "learning_rate": 9.332461232085646e-07, "logits/chosen": 0.6817182302474976, "logits/rejected": 0.7388890385627747, "logps/chosen": -8.783699989318848, "logps/rejected": -9.290229797363281, "loss": 0.5884, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.783699989318848, "rewards/margins": 0.5065295696258545, "rewards/rejected": -9.290229797363281, "semantic_entropy": 0.0022387620992958546, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 15.614698518847275, "learning_rate": 9.324665821239998e-07, "logits/chosen": 0.6605618000030518, "logits/rejected": 0.7475873827934265, "logps/chosen": -8.55317497253418, "logps/rejected": -9.264518737792969, "loss": 0.5843, "rewards/accuracies": 0.6875, "rewards/chosen": -8.55317497253418, "rewards/margins": 0.7113439440727234, "rewards/rejected": -9.264518737792969, "semantic_entropy": 0.002737089293077588, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 13.033534778573415, "learning_rate": 9.316828452467583e-07, "logits/chosen": 0.6980472803115845, "logits/rejected": 0.7700284719467163, "logps/chosen": -8.742466926574707, "logps/rejected": -9.366756439208984, "loss": 0.5443, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.742466926574707, "rewards/margins": 0.6242889165878296, "rewards/rejected": -9.366756439208984, "semantic_entropy": 0.0023838577326387167, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 38.89935633989603, "learning_rate": 9.30894920180659e-07, "logits/chosen": 0.7200860977172852, "logits/rejected": 0.7739099264144897, "logps/chosen": -8.618478775024414, "logps/rejected": -9.061718940734863, "loss": 0.6104, "rewards/accuracies": 0.65625, "rewards/chosen": -8.618478775024414, "rewards/margins": 0.4432406425476074, "rewards/rejected": -9.061718940734863, "semantic_entropy": 0.002750970423221588, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 12.996110838480991, "learning_rate": 9.301028145701543e-07, "logits/chosen": 0.6858905553817749, "logits/rejected": 0.7582074403762817, "logps/chosen": -8.569108963012695, "logps/rejected": -9.244118690490723, "loss": 0.5774, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.569108963012695, "rewards/margins": 0.6750102043151855, "rewards/rejected": -9.244118690490723, "semantic_entropy": 0.0030036987736821175, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 11.513246333141913, "learning_rate": 9.293065361002563e-07, "logits/chosen": 0.6833176612854004, "logits/rejected": 0.7447593212127686, "logps/chosen": -8.49552059173584, "logps/rejected": -9.083105087280273, "loss": 0.5847, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -8.49552059173584, "rewards/margins": 0.5875846147537231, "rewards/rejected": -9.083105087280273, "semantic_entropy": 0.003222426865249872, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 17.050277712672678, "learning_rate": 9.285060924964622e-07, "logits/chosen": 0.6484526991844177, "logits/rejected": 0.7113555669784546, "logps/chosen": -8.497220993041992, "logps/rejected": -9.029525756835938, "loss": 0.5907, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -8.497220993041992, "rewards/margins": 0.5323046445846558, "rewards/rejected": -9.029525756835938, "semantic_entropy": 0.002952256705611944, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 14.721214698389861, "learning_rate": 9.277014915246792e-07, "logits/chosen": 0.7316317558288574, "logits/rejected": 0.7572312355041504, "logps/chosen": -8.28238296508789, "logps/rejected": -8.961918830871582, "loss": 0.5423, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.28238296508789, "rewards/margins": 0.6795355677604675, "rewards/rejected": -8.961918830871582, "semantic_entropy": 0.004207999911159277, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 12.207483167169574, "learning_rate": 9.268927409911498e-07, "logits/chosen": 0.703294038772583, "logits/rejected": 0.7658201456069946, "logps/chosen": -8.169378280639648, "logps/rejected": -8.818994522094727, "loss": 0.5517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.169378280639648, "rewards/margins": 0.6496168971061707, "rewards/rejected": -8.818994522094727, "semantic_entropy": 0.0045172530226409435, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 31.132649463038923, "learning_rate": 9.260798487423749e-07, "logits/chosen": 0.6745550036430359, "logits/rejected": 0.7686847448348999, "logps/chosen": -8.215496063232422, "logps/rejected": -8.745055198669434, "loss": 0.5833, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.215496063232422, "rewards/margins": 0.5295597910881042, "rewards/rejected": -8.745055198669434, "semantic_entropy": 0.004211473278701305, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 20.17196061549839, "learning_rate": 9.252628226650389e-07, "logits/chosen": 0.68059903383255, "logits/rejected": 0.7309106588363647, "logps/chosen": -8.216412544250488, "logps/rejected": -8.789416313171387, "loss": 0.5969, "rewards/accuracies": 0.65625, "rewards/chosen": -8.216412544250488, "rewards/margins": 0.5730043649673462, "rewards/rejected": -8.789416313171387, "semantic_entropy": 0.004476086236536503, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 17.688599393649028, "learning_rate": 9.244416706859321e-07, "logits/chosen": 0.6764446496963501, "logits/rejected": 0.74993497133255, "logps/chosen": -8.021484375, "logps/rejected": -8.647329330444336, "loss": 0.5842, "rewards/accuracies": 0.71875, "rewards/chosen": -8.021484375, "rewards/margins": 0.6258445978164673, "rewards/rejected": -8.647329330444336, "semantic_entropy": 0.005138213746249676, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 14.905907733509586, "learning_rate": 9.23616400771875e-07, "logits/chosen": 0.6466782689094543, "logits/rejected": 0.7279826402664185, "logps/chosen": -7.967951774597168, "logps/rejected": -8.646588325500488, "loss": 0.5654, "rewards/accuracies": 0.71875, "rewards/chosen": -7.967951774597168, "rewards/margins": 0.6786371469497681, "rewards/rejected": -8.646588325500488, "semantic_entropy": 0.004834360908716917, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 11.247618506337135, "learning_rate": 9.227870209296395e-07, "logits/chosen": 0.6892117857933044, "logits/rejected": 0.7534765601158142, "logps/chosen": -8.057080268859863, "logps/rejected": -8.565564155578613, "loss": 0.6136, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -8.057080268859863, "rewards/margins": 0.5084843039512634, "rewards/rejected": -8.565564155578613, "semantic_entropy": 0.00426045898348093, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 13.37630930170591, "learning_rate": 9.219535392058728e-07, "logits/chosen": 0.6549677848815918, "logits/rejected": 0.6734327077865601, "logps/chosen": -8.016042709350586, "logps/rejected": -8.552094459533691, "loss": 0.6187, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -8.016042709350586, "rewards/margins": 0.5360512137413025, "rewards/rejected": -8.552094459533691, "semantic_entropy": 0.005072770640254021, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 13.216764994602103, "learning_rate": 9.211159636870181e-07, "logits/chosen": 0.717325747013092, "logits/rejected": 0.8044508695602417, "logps/chosen": -8.261899948120117, "logps/rejected": -8.891412734985352, "loss": 0.5755, "rewards/accuracies": 0.6875, "rewards/chosen": -8.261899948120117, "rewards/margins": 0.6295128464698792, "rewards/rejected": -8.891412734985352, "semantic_entropy": 0.0033580393064767122, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 15.240573495892372, "learning_rate": 9.202743024992367e-07, "logits/chosen": 0.8391082882881165, "logits/rejected": 0.8751834034919739, "logps/chosen": -8.093847274780273, "logps/rejected": -8.829301834106445, "loss": 0.5484, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.093847274780273, "rewards/margins": 0.7354532480239868, "rewards/rejected": -8.829301834106445, "semantic_entropy": 0.004077838733792305, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 14.604017721504034, "learning_rate": 9.194285638083293e-07, "logits/chosen": 0.8139607310295105, "logits/rejected": 0.8734992742538452, "logps/chosen": -8.397181510925293, "logps/rejected": -9.134657859802246, "loss": 0.5343, "rewards/accuracies": 0.71875, "rewards/chosen": -8.397181510925293, "rewards/margins": 0.7374764680862427, "rewards/rejected": -9.134657859802246, "semantic_entropy": 0.0033265065867453814, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 17.08531394574266, "learning_rate": 9.185787558196562e-07, "logits/chosen": 0.8408036231994629, "logits/rejected": 0.882840633392334, "logps/chosen": -8.239664077758789, "logps/rejected": -8.969578742980957, "loss": 0.5681, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.239664077758789, "rewards/margins": 0.7299133539199829, "rewards/rejected": -8.969578742980957, "semantic_entropy": 0.0042380583472549915, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 14.477147607243182, "learning_rate": 9.177248867780583e-07, "logits/chosen": 0.8981844186782837, "logits/rejected": 0.9408555030822754, "logps/chosen": -8.414166450500488, "logps/rejected": -8.8630952835083, "loss": 0.6356, "rewards/accuracies": 0.65625, "rewards/chosen": -8.414166450500488, "rewards/margins": 0.44892817735671997, "rewards/rejected": -8.8630952835083, "semantic_entropy": 0.003574197646230459, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 13.661227346120718, "learning_rate": 9.168669649677769e-07, "logits/chosen": 0.8391574621200562, "logits/rejected": 0.8985759019851685, "logps/chosen": -8.088191032409668, "logps/rejected": -8.639988899230957, "loss": 0.6163, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.088191032409668, "rewards/margins": 0.5517988801002502, "rewards/rejected": -8.639988899230957, "semantic_entropy": 0.00466513354331255, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 14.061529510910384, "learning_rate": 9.16004998712373e-07, "logits/chosen": 0.8700096011161804, "logits/rejected": 0.9046772718429565, "logps/chosen": -8.150907516479492, "logps/rejected": -8.664621353149414, "loss": 0.6186, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -8.150907516479492, "rewards/margins": 0.5137127637863159, "rewards/rejected": -8.664621353149414, "semantic_entropy": 0.004039828199893236, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 12.998907380952517, "learning_rate": 9.151389963746472e-07, "logits/chosen": 0.8303213119506836, "logits/rejected": 0.9604493379592896, "logps/chosen": -8.156909942626953, "logps/rejected": -8.906213760375977, "loss": 0.5157, "rewards/accuracies": 0.78125, "rewards/chosen": -8.156909942626953, "rewards/margins": 0.7493036985397339, "rewards/rejected": -8.906213760375977, "semantic_entropy": 0.004079463891685009, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 11.673455076007292, "learning_rate": 9.142689663565577e-07, "logits/chosen": 0.8863071203231812, "logits/rejected": 0.9234801530838013, "logps/chosen": -8.100628852844238, "logps/rejected": -8.736692428588867, "loss": 0.5499, "rewards/accuracies": 0.6875, "rewards/chosen": -8.100628852844238, "rewards/margins": 0.6360650062561035, "rewards/rejected": -8.736692428588867, "semantic_entropy": 0.0043214112520217896, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 14.906715664230957, "learning_rate": 9.133949170991397e-07, "logits/chosen": 0.8381370306015015, "logits/rejected": 0.8832274675369263, "logps/chosen": -8.123575210571289, "logps/rejected": -8.772272109985352, "loss": 0.5631, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.123575210571289, "rewards/margins": 0.6486952900886536, "rewards/rejected": -8.772272109985352, "semantic_entropy": 0.003872636239975691, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 13.00308584021274, "learning_rate": 9.125168570824231e-07, "logits/chosen": 0.826133131980896, "logits/rejected": 0.8961697816848755, "logps/chosen": -8.128072738647461, "logps/rejected": -8.749670028686523, "loss": 0.5707, "rewards/accuracies": 0.65625, "rewards/chosen": -8.128072738647461, "rewards/margins": 0.621599555015564, "rewards/rejected": -8.749670028686523, "semantic_entropy": 0.00478363037109375, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 22.234033697271784, "learning_rate": 9.116347948253496e-07, "logits/chosen": 0.7835792303085327, "logits/rejected": 0.8497790098190308, "logps/chosen": -8.275663375854492, "logps/rejected": -8.82735538482666, "loss": 0.5884, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -8.275663375854492, "rewards/margins": 0.5516918301582336, "rewards/rejected": -8.82735538482666, "semantic_entropy": 0.003584084566682577, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 13.347377014963383, "learning_rate": 9.107487388856916e-07, "logits/chosen": 0.7705615758895874, "logits/rejected": 0.8761787414550781, "logps/chosen": -8.147015571594238, "logps/rejected": -8.853046417236328, "loss": 0.5173, "rewards/accuracies": 0.78125, "rewards/chosen": -8.147015571594238, "rewards/margins": 0.7060302495956421, "rewards/rejected": -8.853046417236328, "semantic_entropy": 0.004314957652240992, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 18.743981589501008, "learning_rate": 9.098586978599673e-07, "logits/chosen": 0.7425702214241028, "logits/rejected": 0.8426684141159058, "logps/chosen": -8.143719673156738, "logps/rejected": -8.96298599243164, "loss": 0.5714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.143719673156738, "rewards/margins": 0.8192659616470337, "rewards/rejected": -8.96298599243164, "semantic_entropy": 0.00467184092849493, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 17.857816830477596, "learning_rate": 9.089646803833588e-07, "logits/chosen": 0.6718012094497681, "logits/rejected": 0.7780871987342834, "logps/chosen": -8.064419746398926, "logps/rejected": -8.771439552307129, "loss": 0.5654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.064419746398926, "rewards/margins": 0.7070209383964539, "rewards/rejected": -8.771439552307129, "semantic_entropy": 0.004769052378833294, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 15.883695166387948, "learning_rate": 9.080666951296276e-07, "logits/chosen": 0.523202121257782, "logits/rejected": 0.7221616506576538, "logps/chosen": -7.929041385650635, "logps/rejected": -8.971317291259766, "loss": 0.4605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.929041385650635, "rewards/margins": 1.0422756671905518, "rewards/rejected": -8.971317291259766, "semantic_entropy": 0.0057184770703315735, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 11.62870813642443, "learning_rate": 9.071647508110305e-07, "logits/chosen": 0.5561312437057495, "logits/rejected": 0.7267721891403198, "logps/chosen": -7.7915754318237305, "logps/rejected": -8.73208999633789, "loss": 0.524, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -7.7915754318237305, "rewards/margins": 0.9405128359794617, "rewards/rejected": -8.73208999633789, "semantic_entropy": 0.005991402082145214, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 12.493629792798085, "learning_rate": 9.062588561782354e-07, "logits/chosen": 0.6039088368415833, "logits/rejected": 0.6618218421936035, "logps/chosen": -8.060002326965332, "logps/rejected": -8.698019981384277, "loss": 0.5877, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -8.060002326965332, "rewards/margins": 0.6380175352096558, "rewards/rejected": -8.698019981384277, "semantic_entropy": 0.004686708562076092, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 11.295471536198134, "learning_rate": 9.053490200202358e-07, "logits/chosen": 0.7054456472396851, "logits/rejected": 0.763306736946106, "logps/chosen": -8.17889404296875, "logps/rejected": -8.809242248535156, "loss": 0.5912, "rewards/accuracies": 0.6875, "rewards/chosen": -8.17889404296875, "rewards/margins": 0.6303480863571167, "rewards/rejected": -8.809242248535156, "semantic_entropy": 0.004566199611872435, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 18.547048805065355, "learning_rate": 9.044352511642661e-07, "logits/chosen": 0.7248358726501465, "logits/rejected": 0.7658167481422424, "logps/chosen": -8.291397094726562, "logps/rejected": -8.84311294555664, "loss": 0.6214, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.291397094726562, "rewards/margins": 0.5517162680625916, "rewards/rejected": -8.84311294555664, "semantic_entropy": 0.003865548875182867, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 12.961728802829933, "learning_rate": 9.03517558475716e-07, "logits/chosen": 0.721314549446106, "logits/rejected": 0.7908953428268433, "logps/chosen": -8.195411682128906, "logps/rejected": -8.733312606811523, "loss": 0.5662, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.195411682128906, "rewards/margins": 0.5379008650779724, "rewards/rejected": -8.733312606811523, "semantic_entropy": 0.004040508531033993, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 13.388709273345874, "learning_rate": 9.025959508580436e-07, "logits/chosen": 0.7946035861968994, "logits/rejected": 0.9081939458847046, "logps/chosen": -8.505581855773926, "logps/rejected": -9.189567565917969, "loss": 0.5373, "rewards/accuracies": 0.71875, "rewards/chosen": -8.505581855773926, "rewards/margins": 0.6839855313301086, "rewards/rejected": -9.189567565917969, "semantic_entropy": 0.003315441310405731, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 13.04328623863774, "learning_rate": 9.016704372526905e-07, "logits/chosen": 0.7168788313865662, "logits/rejected": 0.8062397837638855, "logps/chosen": -8.312705039978027, "logps/rejected": -8.956674575805664, "loss": 0.5598, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.312705039978027, "rewards/margins": 0.6439692378044128, "rewards/rejected": -8.956674575805664, "semantic_entropy": 0.004072139970958233, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 19.229092305162187, "learning_rate": 9.007410266389934e-07, "logits/chosen": 0.6322071552276611, "logits/rejected": 0.6842302680015564, "logps/chosen": -8.209632873535156, "logps/rejected": -8.76137638092041, "loss": 0.5783, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -8.209632873535156, "rewards/margins": 0.5517433881759644, "rewards/rejected": -8.76137638092041, "semantic_entropy": 0.003791673108935356, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 15.721412896614648, "learning_rate": 8.998077280340981e-07, "logits/chosen": 0.6889594793319702, "logits/rejected": 0.7277365922927856, "logps/chosen": -8.424080848693848, "logps/rejected": -8.998977661132812, "loss": 0.5645, "rewards/accuracies": 0.6875, "rewards/chosen": -8.424080848693848, "rewards/margins": 0.5748964548110962, "rewards/rejected": -8.998977661132812, "semantic_entropy": 0.0032078386284410954, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 12.71386552957631, "learning_rate": 8.988705504928722e-07, "logits/chosen": 0.6700653433799744, "logits/rejected": 0.7809063196182251, "logps/chosen": -8.39610481262207, "logps/rejected": -9.334978103637695, "loss": 0.4796, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.39610481262207, "rewards/margins": 0.938875675201416, "rewards/rejected": -9.334978103637695, "semantic_entropy": 0.00389484572224319, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.8517152070999146, "eval_logits/rejected": 0.9157667756080627, "eval_logps/chosen": -8.478996276855469, "eval_logps/rejected": -9.19737434387207, "eval_loss": 0.5405778884887695, "eval_rewards/accuracies": 0.7047477960586548, "eval_rewards/chosen": -8.478996276855469, "eval_rewards/margins": 0.7183785438537598, "eval_rewards/rejected": -9.19737434387207, "eval_runtime": 35.1436, "eval_samples_per_second": 38.272, "eval_semantic_entropy": 0.0034910058602690697, "eval_steps_per_second": 9.589, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 15.698469595260914, "learning_rate": 8.979295031078157e-07, "logits/chosen": 0.6854676008224487, "logits/rejected": 0.8206149935722351, "logps/chosen": -8.568761825561523, "logps/rejected": -9.31121826171875, "loss": 0.5156, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.568761825561523, "rewards/margins": 0.742457389831543, "rewards/rejected": -9.31121826171875, "semantic_entropy": 0.003175111021846533, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 13.477417972760096, "learning_rate": 8.969845950089751e-07, "logits/chosen": 0.699101448059082, "logits/rejected": 0.8196004033088684, "logps/chosen": -8.327180862426758, "logps/rejected": -9.16191577911377, "loss": 0.51, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.327180862426758, "rewards/margins": 0.8347347974777222, "rewards/rejected": -9.16191577911377, "semantic_entropy": 0.0040628439746797085, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 20.385879298528565, "learning_rate": 8.960358353638526e-07, "logits/chosen": 0.7844869494438171, "logits/rejected": 0.8709976077079773, "logps/chosen": -8.346048355102539, "logps/rejected": -9.043893814086914, "loss": 0.5844, "rewards/accuracies": 0.6875, "rewards/chosen": -8.346048355102539, "rewards/margins": 0.6978455781936646, "rewards/rejected": -9.043893814086914, "semantic_entropy": 0.004238657653331757, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 16.839936129408876, "learning_rate": 8.950832333773184e-07, "logits/chosen": 0.8071925044059753, "logits/rejected": 0.9016444087028503, "logps/chosen": -8.5030517578125, "logps/rejected": -9.184589385986328, "loss": 0.5976, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -8.5030517578125, "rewards/margins": 0.6815375685691833, "rewards/rejected": -9.184589385986328, "semantic_entropy": 0.0040657538920640945, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 16.82819858205259, "learning_rate": 8.941267982915213e-07, "logits/chosen": 0.8722259402275085, "logits/rejected": 0.9098442196846008, "logps/chosen": -8.707246780395508, "logps/rejected": -9.02901554107666, "loss": 0.7021, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.707246780395508, "rewards/margins": 0.32176870107650757, "rewards/rejected": -9.02901554107666, "semantic_entropy": 0.0030822004191577435, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 12.54191217498305, "learning_rate": 8.931665393857983e-07, "logits/chosen": 0.853954017162323, "logits/rejected": 0.9320189356803894, "logps/chosen": -8.5809965133667, "logps/rejected": -9.205659866333008, "loss": 0.5758, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.5809965133667, "rewards/margins": 0.6246632933616638, "rewards/rejected": -9.205659866333008, "semantic_entropy": 0.002861475106328726, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 13.279557320703779, "learning_rate": 8.922024659765861e-07, "logits/chosen": 0.830333411693573, "logits/rejected": 0.9043375849723816, "logps/chosen": -8.427899360656738, "logps/rejected": -9.160634994506836, "loss": 0.5253, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.427899360656738, "rewards/margins": 0.7327350378036499, "rewards/rejected": -9.160634994506836, "semantic_entropy": 0.0032138600945472717, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 19.39467151368381, "learning_rate": 8.912345874173288e-07, "logits/chosen": 0.8193842172622681, "logits/rejected": 0.8876082301139832, "logps/chosen": -8.680830001831055, "logps/rejected": -9.321008682250977, "loss": 0.5834, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.680830001831055, "rewards/margins": 0.6401779651641846, "rewards/rejected": -9.321008682250977, "semantic_entropy": 0.0026057157665491104, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 15.252904015477414, "learning_rate": 8.902629130983885e-07, "logits/chosen": 0.8152815103530884, "logits/rejected": 0.8414192199707031, "logps/chosen": -8.817136764526367, "logps/rejected": -9.296814918518066, "loss": 0.6108, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.817136764526367, "rewards/margins": 0.4796779751777649, "rewards/rejected": -9.296814918518066, "semantic_entropy": 0.002496039029210806, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 17.953373871726004, "learning_rate": 8.892874524469537e-07, "logits/chosen": 0.8935707211494446, "logits/rejected": 0.9353858232498169, "logps/chosen": -8.69524097442627, "logps/rejected": -9.359209060668945, "loss": 0.5249, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.69524097442627, "rewards/margins": 0.6639670133590698, "rewards/rejected": -9.359209060668945, "semantic_entropy": 0.002959498204290867, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 15.625644253516462, "learning_rate": 8.883082149269478e-07, "logits/chosen": 0.8291314840316772, "logits/rejected": 0.8965352177619934, "logps/chosen": -8.816374778747559, "logps/rejected": -9.487445831298828, "loss": 0.5349, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.816374778747559, "rewards/margins": 0.6710702180862427, "rewards/rejected": -9.487445831298828, "semantic_entropy": 0.0024764954578131437, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 15.866078991034088, "learning_rate": 8.873252100389377e-07, "logits/chosen": 0.8331910371780396, "logits/rejected": 0.8664076924324036, "logps/chosen": -8.80284595489502, "logps/rejected": -9.485407829284668, "loss": 0.5391, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.80284595489502, "rewards/margins": 0.6825627088546753, "rewards/rejected": -9.485407829284668, "semantic_entropy": 0.002537056338042021, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 18.091028298007316, "learning_rate": 8.863384473200411e-07, "logits/chosen": 0.8735591769218445, "logits/rejected": 0.8835130929946899, "logps/chosen": -9.021527290344238, "logps/rejected": -9.55382251739502, "loss": 0.5901, "rewards/accuracies": 0.6875, "rewards/chosen": -9.021527290344238, "rewards/margins": 0.532294750213623, "rewards/rejected": -9.55382251739502, "semantic_entropy": 0.0023630578070878983, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 15.178387048780932, "learning_rate": 8.853479363438342e-07, "logits/chosen": 0.8976732492446899, "logits/rejected": 0.9629983901977539, "logps/chosen": -9.05078411102295, "logps/rejected": -9.503996849060059, "loss": 0.63, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -9.05078411102295, "rewards/margins": 0.45321202278137207, "rewards/rejected": -9.503996849060059, "semantic_entropy": 0.0022809661459177732, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 16.07101375873566, "learning_rate": 8.843536867202588e-07, "logits/chosen": 0.8819114565849304, "logits/rejected": 0.9694005250930786, "logps/chosen": -8.932337760925293, "logps/rejected": -9.645790100097656, "loss": 0.5404, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.932337760925293, "rewards/margins": 0.7134513258934021, "rewards/rejected": -9.645790100097656, "semantic_entropy": 0.002526444150134921, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 22.308829335036855, "learning_rate": 8.833557080955292e-07, "logits/chosen": 0.8551505208015442, "logits/rejected": 0.8981190919876099, "logps/chosen": -8.808084487915039, "logps/rejected": -9.255620956420898, "loss": 0.644, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -8.808084487915039, "rewards/margins": 0.4475362300872803, "rewards/rejected": -9.255620956420898, "semantic_entropy": 0.0026800683699548244, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 20.755871823674013, "learning_rate": 8.823540101520381e-07, "logits/chosen": 0.8553838729858398, "logits/rejected": 0.9643619656562805, "logps/chosen": -8.639958381652832, "logps/rejected": -9.357548713684082, "loss": 0.5769, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.639958381652832, "rewards/margins": 0.7175900340080261, "rewards/rejected": -9.357548713684082, "semantic_entropy": 0.002625108230859041, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 17.716986972701864, "learning_rate": 8.813486026082637e-07, "logits/chosen": 0.8817728161811829, "logits/rejected": 0.9899671673774719, "logps/chosen": -8.551309585571289, "logps/rejected": -9.368635177612305, "loss": 0.5262, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.551309585571289, "rewards/margins": 0.8173257112503052, "rewards/rejected": -9.368635177612305, "semantic_entropy": 0.0030341236852109432, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 26.70901823794411, "learning_rate": 8.803394952186742e-07, "logits/chosen": 0.7727741003036499, "logits/rejected": 0.8659934997558594, "logps/chosen": -8.45335578918457, "logps/rejected": -9.134082794189453, "loss": 0.528, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.45335578918457, "rewards/margins": 0.6807276606559753, "rewards/rejected": -9.134082794189453, "semantic_entropy": 0.0030217047315090895, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 15.766523978337645, "learning_rate": 8.793266977736342e-07, "logits/chosen": 0.8686197996139526, "logits/rejected": 0.8411453366279602, "logps/chosen": -8.678540229797363, "logps/rejected": -9.106678009033203, "loss": 0.6239, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.678540229797363, "rewards/margins": 0.42813801765441895, "rewards/rejected": -9.106678009033203, "semantic_entropy": 0.0028703988064080477, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 18.30778845018795, "learning_rate": 8.783102200993085e-07, "logits/chosen": 0.8415244817733765, "logits/rejected": 0.9102290868759155, "logps/chosen": -8.705827713012695, "logps/rejected": -9.459232330322266, "loss": 0.5234, "rewards/accuracies": 0.75, "rewards/chosen": -8.705827713012695, "rewards/margins": 0.753406286239624, "rewards/rejected": -9.459232330322266, "semantic_entropy": 0.002563622547313571, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 14.122552458570945, "learning_rate": 8.772900720575683e-07, "logits/chosen": 0.8687243461608887, "logits/rejected": 0.9228528738021851, "logps/chosen": -8.917773246765137, "logps/rejected": -9.463602066040039, "loss": 0.5956, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.917773246765137, "rewards/margins": 0.5458282828330994, "rewards/rejected": -9.463602066040039, "semantic_entropy": 0.002534933853894472, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 22.1831103043282, "learning_rate": 8.762662635458944e-07, "logits/chosen": 0.8289508819580078, "logits/rejected": 0.9157236218452454, "logps/chosen": -8.972761154174805, "logps/rejected": -9.611922264099121, "loss": 0.6294, "rewards/accuracies": 0.6875, "rewards/chosen": -8.972761154174805, "rewards/margins": 0.6391609907150269, "rewards/rejected": -9.611922264099121, "semantic_entropy": 0.0023491496685892344, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 18.48437033985848, "learning_rate": 8.752388044972811e-07, "logits/chosen": 0.8286212086677551, "logits/rejected": 0.8758748769760132, "logps/chosen": -8.496380805969238, "logps/rejected": -9.228724479675293, "loss": 0.5515, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.496380805969238, "rewards/margins": 0.7323442101478577, "rewards/rejected": -9.228724479675293, "semantic_entropy": 0.003955576568841934, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 15.320431126329213, "learning_rate": 8.74207704880141e-07, "logits/chosen": 0.7788019180297852, "logits/rejected": 0.8485990762710571, "logps/chosen": -8.44409465789795, "logps/rejected": -9.369488716125488, "loss": 0.4862, "rewards/accuracies": 0.78125, "rewards/chosen": -8.44409465789795, "rewards/margins": 0.9253931045532227, "rewards/rejected": -9.369488716125488, "semantic_entropy": 0.0033842413686215878, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 13.032187521778193, "learning_rate": 8.731729746982068e-07, "logits/chosen": 0.7908933758735657, "logits/rejected": 0.839871883392334, "logps/chosen": -8.113957405090332, "logps/rejected": -8.812549591064453, "loss": 0.5337, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.113957405090332, "rewards/margins": 0.6985923647880554, "rewards/rejected": -8.812549591064453, "semantic_entropy": 0.004230237565934658, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 19.494387499479178, "learning_rate": 8.721346239904355e-07, "logits/chosen": 0.72081059217453, "logits/rejected": 0.8257828950881958, "logps/chosen": -8.163381576538086, "logps/rejected": -8.904546737670898, "loss": 0.5976, "rewards/accuracies": 0.625, "rewards/chosen": -8.163381576538086, "rewards/margins": 0.7411641478538513, "rewards/rejected": -8.904546737670898, "semantic_entropy": 0.00452050007879734, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 18.390962651772945, "learning_rate": 8.710926628309101e-07, "logits/chosen": 0.7455258965492249, "logits/rejected": 0.8390616178512573, "logps/chosen": -8.214960098266602, "logps/rejected": -8.860664367675781, "loss": 0.5546, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.214960098266602, "rewards/margins": 0.6457030177116394, "rewards/rejected": -8.860664367675781, "semantic_entropy": 0.004450926091521978, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 12.45958358529586, "learning_rate": 8.700471013287424e-07, "logits/chosen": 0.779666543006897, "logits/rejected": 0.7992655038833618, "logps/chosen": -7.90356969833374, "logps/rejected": -8.546361923217773, "loss": 0.5456, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.90356969833374, "rewards/margins": 0.6427920460700989, "rewards/rejected": -8.546361923217773, "semantic_entropy": 0.005241268780082464, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 20.787825790509054, "learning_rate": 8.689979496279746e-07, "logits/chosen": 0.7400572896003723, "logits/rejected": 0.7830491065979004, "logps/chosen": -8.003057479858398, "logps/rejected": -8.49720573425293, "loss": 0.6642, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -8.003057479858398, "rewards/margins": 0.4941479563713074, "rewards/rejected": -8.49720573425293, "semantic_entropy": 0.005025799386203289, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 13.730264245914015, "learning_rate": 8.679452179074811e-07, "logits/chosen": 0.7920068502426147, "logits/rejected": 0.8651610612869263, "logps/chosen": -7.900570869445801, "logps/rejected": -8.649713516235352, "loss": 0.5124, "rewards/accuracies": 0.75, "rewards/chosen": -7.900570869445801, "rewards/margins": 0.7491430640220642, "rewards/rejected": -8.649713516235352, "semantic_entropy": 0.005135712679475546, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 13.626163938056823, "learning_rate": 8.668889163808698e-07, "logits/chosen": 0.7864473462104797, "logits/rejected": 0.8614629507064819, "logps/chosen": -7.7179975509643555, "logps/rejected": -8.315633773803711, "loss": 0.5636, "rewards/accuracies": 0.6875, "rewards/chosen": -7.7179975509643555, "rewards/margins": 0.5976354479789734, "rewards/rejected": -8.315633773803711, "semantic_entropy": 0.0065610273741185665, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 15.396621437912136, "learning_rate": 8.658290552963827e-07, "logits/chosen": 0.7842726111412048, "logits/rejected": 0.8139573335647583, "logps/chosen": -7.729952812194824, "logps/rejected": -8.440402030944824, "loss": 0.5618, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -7.729952812194824, "rewards/margins": 0.710450291633606, "rewards/rejected": -8.440402030944824, "semantic_entropy": 0.005991552956402302, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 11.779338418294584, "learning_rate": 8.647656449367966e-07, "logits/chosen": 0.7771416306495667, "logits/rejected": 0.8638502359390259, "logps/chosen": -7.753976345062256, "logps/rejected": -8.381518363952637, "loss": 0.5732, "rewards/accuracies": 0.71875, "rewards/chosen": -7.753976345062256, "rewards/margins": 0.6275419592857361, "rewards/rejected": -8.381518363952637, "semantic_entropy": 0.0064716823399066925, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 11.790444019030152, "learning_rate": 8.636986956193235e-07, "logits/chosen": 0.7170445919036865, "logits/rejected": 0.7984222173690796, "logps/chosen": -7.559231758117676, "logps/rejected": -8.215555191040039, "loss": 0.5718, "rewards/accuracies": 0.6875, "rewards/chosen": -7.559231758117676, "rewards/margins": 0.6563239097595215, "rewards/rejected": -8.215555191040039, "semantic_entropy": 0.007783152163028717, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 13.251274223031999, "learning_rate": 8.626282176955104e-07, "logits/chosen": 0.7697458863258362, "logits/rejected": 0.8472963571548462, "logps/chosen": -7.661177635192871, "logps/rejected": -8.409696578979492, "loss": 0.5248, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -7.661177635192871, "rewards/margins": 0.7485184073448181, "rewards/rejected": -8.409696578979492, "semantic_entropy": 0.006556454114615917, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 16.317970460951365, "learning_rate": 8.615542215511389e-07, "logits/chosen": 0.7925196886062622, "logits/rejected": 0.827374279499054, "logps/chosen": -7.7944183349609375, "logps/rejected": -8.290719985961914, "loss": 0.5983, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -7.7944183349609375, "rewards/margins": 0.4963007867336273, "rewards/rejected": -8.290719985961914, "semantic_entropy": 0.006235038861632347, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 17.388652579583002, "learning_rate": 8.604767176061241e-07, "logits/chosen": 0.7371417284011841, "logits/rejected": 0.7948885560035706, "logps/chosen": -7.8774542808532715, "logps/rejected": -8.463842391967773, "loss": 0.5767, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -7.8774542808532715, "rewards/margins": 0.5863882303237915, "rewards/rejected": -8.463842391967773, "semantic_entropy": 0.0055058179423213005, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 10.102522428025908, "learning_rate": 8.593957163144141e-07, "logits/chosen": 0.701524555683136, "logits/rejected": 0.7919615507125854, "logps/chosen": -7.6824631690979, "logps/rejected": -8.461966514587402, "loss": 0.516, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.6824631690979, "rewards/margins": 0.7795030474662781, "rewards/rejected": -8.461966514587402, "semantic_entropy": 0.007335428148508072, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 13.049955014081108, "learning_rate": 8.58311228163888e-07, "logits/chosen": 0.723731279373169, "logits/rejected": 0.7662105560302734, "logps/chosen": -7.902833461761475, "logps/rejected": -8.512142181396484, "loss": 0.5491, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -7.902833461761475, "rewards/margins": 0.6093090772628784, "rewards/rejected": -8.512142181396484, "semantic_entropy": 0.00509651331230998, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 17.055444027861736, "learning_rate": 8.57223263676255e-07, "logits/chosen": 0.651114821434021, "logits/rejected": 0.7337725758552551, "logps/chosen": -7.82892370223999, "logps/rejected": -8.779696464538574, "loss": 0.4542, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.82892370223999, "rewards/margins": 0.950772762298584, "rewards/rejected": -8.779696464538574, "semantic_entropy": 0.0056858672760427, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 12.069977661137925, "learning_rate": 8.561318334069511e-07, "logits/chosen": 0.722413182258606, "logits/rejected": 0.8114809989929199, "logps/chosen": -8.02531623840332, "logps/rejected": -8.714741706848145, "loss": 0.5643, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.02531623840332, "rewards/margins": 0.6894262433052063, "rewards/rejected": -8.714741706848145, "semantic_entropy": 0.004859632812440395, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 14.829212830302785, "learning_rate": 8.550369479450375e-07, "logits/chosen": 0.7346758842468262, "logits/rejected": 0.8068816065788269, "logps/chosen": -8.147111892700195, "logps/rejected": -8.884883880615234, "loss": 0.5406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.147111892700195, "rewards/margins": 0.7377720475196838, "rewards/rejected": -8.884883880615234, "semantic_entropy": 0.00493080448359251, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 16.49209421591941, "learning_rate": 8.539386179130977e-07, "logits/chosen": 0.7819596529006958, "logits/rejected": 0.8145734667778015, "logps/chosen": -8.00898551940918, "logps/rejected": -8.68702507019043, "loss": 0.5593, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.00898551940918, "rewards/margins": 0.6780385971069336, "rewards/rejected": -8.68702507019043, "semantic_entropy": 0.005897555500268936, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 14.239228008160856, "learning_rate": 8.528368539671347e-07, "logits/chosen": 0.7752368450164795, "logits/rejected": 0.863335132598877, "logps/chosen": -7.959936618804932, "logps/rejected": -9.050023078918457, "loss": 0.4822, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -7.959936618804932, "rewards/margins": 1.0900851488113403, "rewards/rejected": -9.050023078918457, "semantic_entropy": 0.005652183201164007, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 16.436302978410623, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.8048080205917358, "logits/rejected": 0.8463269472122192, "logps/chosen": -8.244328498840332, "logps/rejected": -8.958102226257324, "loss": 0.5714, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.244328498840332, "rewards/margins": 0.7137740254402161, "rewards/rejected": -8.958102226257324, "semantic_entropy": 0.0040356675162911415, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 17.79215643845262, "learning_rate": 8.506230671236254e-07, "logits/chosen": 0.7882435917854309, "logits/rejected": 0.8229848146438599, "logps/chosen": -8.396721839904785, "logps/rejected": -8.957392692565918, "loss": 0.5884, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.396721839904785, "rewards/margins": 0.5606712102890015, "rewards/rejected": -8.957392692565918, "semantic_entropy": 0.0036423238925635815, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 14.625489667473667, "learning_rate": 8.495110657042488e-07, "logits/chosen": 0.8869732618331909, "logits/rejected": 0.9640114903450012, "logps/chosen": -8.529989242553711, "logps/rejected": -9.27656364440918, "loss": 0.5253, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.529989242553711, "rewards/margins": 0.7465731501579285, "rewards/rejected": -9.27656364440918, "semantic_entropy": 0.0031088325195014477, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 18.188969568475773, "learning_rate": 8.483956733269799e-07, "logits/chosen": 0.8915464282035828, "logits/rejected": 0.9507058262825012, "logps/chosen": -8.5012845993042, "logps/rejected": -9.236323356628418, "loss": 0.5562, "rewards/accuracies": 0.71875, "rewards/chosen": -8.5012845993042, "rewards/margins": 0.7350392937660217, "rewards/rejected": -9.236323356628418, "semantic_entropy": 0.002896857215091586, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 17.9922914014973, "learning_rate": 8.472769008133602e-07, "logits/chosen": 0.8699381947517395, "logits/rejected": 0.953484833240509, "logps/chosen": -8.702693939208984, "logps/rejected": -9.392245292663574, "loss": 0.5587, "rewards/accuracies": 0.6875, "rewards/chosen": -8.702693939208984, "rewards/margins": 0.6895512342453003, "rewards/rejected": -9.392245292663574, "semantic_entropy": 0.00236605666577816, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 15.27885989370606, "learning_rate": 8.461547590177259e-07, "logits/chosen": 0.9513596296310425, "logits/rejected": 1.0096721649169922, "logps/chosen": -8.60840129852295, "logps/rejected": -9.362217903137207, "loss": 0.5943, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.60840129852295, "rewards/margins": 0.7538172006607056, "rewards/rejected": -9.362217903137207, "semantic_entropy": 0.0031571455765515566, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 16.3301144819492, "learning_rate": 8.450292588271014e-07, "logits/chosen": 0.9395162463188171, "logits/rejected": 0.9991067051887512, "logps/chosen": -8.834760665893555, "logps/rejected": -9.530847549438477, "loss": 0.5574, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.834760665893555, "rewards/margins": 0.696088433265686, "rewards/rejected": -9.530847549438477, "semantic_entropy": 0.0025766813196241856, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 14.909090646149107, "learning_rate": 8.439004111610945e-07, "logits/chosen": 0.9531529545783997, "logits/rejected": 0.9883922338485718, "logps/chosen": -8.549825668334961, "logps/rejected": -9.276273727416992, "loss": 0.5704, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.549825668334961, "rewards/margins": 0.7264472246170044, "rewards/rejected": -9.276273727416992, "semantic_entropy": 0.003057825844734907, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 14.551680179512683, "learning_rate": 8.427682269717901e-07, "logits/chosen": 0.918908953666687, "logits/rejected": 0.9724335670471191, "logps/chosen": -8.606492042541504, "logps/rejected": -9.455463409423828, "loss": 0.495, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.606492042541504, "rewards/margins": 0.8489717245101929, "rewards/rejected": -9.455463409423828, "semantic_entropy": 0.002914209384471178, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 17.126300719381895, "learning_rate": 8.416327172436446e-07, "logits/chosen": 0.9382045865058899, "logits/rejected": 1.0026956796646118, "logps/chosen": -8.67430591583252, "logps/rejected": -9.235125541687012, "loss": 0.5954, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -8.67430591583252, "rewards/margins": 0.5608205795288086, "rewards/rejected": -9.235125541687012, "semantic_entropy": 0.0024842366110533476, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 12.526549153278241, "learning_rate": 8.404938929933778e-07, "logits/chosen": 0.9702759981155396, "logits/rejected": 1.0298935174942017, "logps/chosen": -8.516494750976562, "logps/rejected": -9.477919578552246, "loss": 0.4751, "rewards/accuracies": 0.78125, "rewards/chosen": -8.516494750976562, "rewards/margins": 0.961426854133606, "rewards/rejected": -9.477919578552246, "semantic_entropy": 0.0030000859405845404, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 13.104074462534614, "learning_rate": 8.39351765269868e-07, "logits/chosen": 0.9352337121963501, "logits/rejected": 0.9791328310966492, "logps/chosen": -8.481460571289062, "logps/rejected": -9.137152671813965, "loss": 0.5852, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.481460571289062, "rewards/margins": 0.6556928157806396, "rewards/rejected": -9.137152671813965, "semantic_entropy": 0.0034713305067270994, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 15.985094320283427, "learning_rate": 8.382063451540431e-07, "logits/chosen": 0.9075764417648315, "logits/rejected": 1.0042946338653564, "logps/chosen": -8.522588729858398, "logps/rejected": -9.330839157104492, "loss": 0.4979, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.522588729858398, "rewards/margins": 0.8082510828971863, "rewards/rejected": -9.330839157104492, "semantic_entropy": 0.0028249945025891066, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 12.915214195374903, "learning_rate": 8.370576437587742e-07, "logits/chosen": 0.8950628042221069, "logits/rejected": 0.924281120300293, "logps/chosen": -8.34730052947998, "logps/rejected": -9.118095397949219, "loss": 0.5147, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.34730052947998, "rewards/margins": 0.7707957625389099, "rewards/rejected": -9.118095397949219, "semantic_entropy": 0.0034280649852007627, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 14.453317743532807, "learning_rate": 8.359056722287674e-07, "logits/chosen": 0.8320645093917847, "logits/rejected": 0.9602710604667664, "logps/chosen": -8.307376861572266, "logps/rejected": -9.152776718139648, "loss": 0.5081, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.307376861572266, "rewards/margins": 0.845400333404541, "rewards/rejected": -9.152776718139648, "semantic_entropy": 0.0037023150362074375, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 13.51505578799361, "learning_rate": 8.347504417404553e-07, "logits/chosen": 0.8122785687446594, "logits/rejected": 0.8963130712509155, "logps/chosen": -8.318288803100586, "logps/rejected": -9.081267356872559, "loss": 0.536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.318288803100586, "rewards/margins": 0.7629793882369995, "rewards/rejected": -9.081267356872559, "semantic_entropy": 0.003525532316416502, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 11.191552343163695, "learning_rate": 8.335919635018893e-07, "logits/chosen": 0.7361363172531128, "logits/rejected": 0.8019342422485352, "logps/chosen": -8.195878982543945, "logps/rejected": -8.886285781860352, "loss": 0.5351, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.195878982543945, "rewards/margins": 0.6904064416885376, "rewards/rejected": -8.886285781860352, "semantic_entropy": 0.004457551054656506, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 12.647769499958207, "learning_rate": 8.324302487526303e-07, "logits/chosen": 0.7044271230697632, "logits/rejected": 0.77665114402771, "logps/chosen": -8.380681037902832, "logps/rejected": -9.184738159179688, "loss": 0.5034, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.380681037902832, "rewards/margins": 0.8040567636489868, "rewards/rejected": -9.184738159179688, "semantic_entropy": 0.003701858688145876, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 13.95644498738611, "learning_rate": 8.312653087636398e-07, "logits/chosen": 0.722461998462677, "logits/rejected": 0.7603663206100464, "logps/chosen": -8.306981086730957, "logps/rejected": -9.107414245605469, "loss": 0.5306, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.306981086730957, "rewards/margins": 0.8004336357116699, "rewards/rejected": -9.107414245605469, "semantic_entropy": 0.004512041341513395, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 18.459332801354297, "learning_rate": 8.300971548371711e-07, "logits/chosen": 0.5903456211090088, "logits/rejected": 0.7183451056480408, "logps/chosen": -8.510897636413574, "logps/rejected": -9.23041820526123, "loss": 0.5363, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.510897636413574, "rewards/margins": 0.719520628452301, "rewards/rejected": -9.23041820526123, "semantic_entropy": 0.003166732145473361, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 18.340027693624933, "learning_rate": 8.289257983066582e-07, "logits/chosen": 0.6703733205795288, "logits/rejected": 0.7412980198860168, "logps/chosen": -8.390009880065918, "logps/rejected": -9.196538925170898, "loss": 0.5283, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.390009880065918, "rewards/margins": 0.8065292239189148, "rewards/rejected": -9.196538925170898, "semantic_entropy": 0.004317262209951878, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 14.802116730649294, "learning_rate": 8.277512505366077e-07, "logits/chosen": 0.6543900966644287, "logits/rejected": 0.7763436436653137, "logps/chosen": -8.447009086608887, "logps/rejected": -9.306153297424316, "loss": 0.5238, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.447009086608887, "rewards/margins": 0.8591440916061401, "rewards/rejected": -9.306153297424316, "semantic_entropy": 0.003334530396386981, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 15.098311821125256, "learning_rate": 8.265735229224868e-07, "logits/chosen": 0.6818082928657532, "logits/rejected": 0.7659986615180969, "logps/chosen": -8.268132209777832, "logps/rejected": -9.30879020690918, "loss": 0.4737, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.268132209777832, "rewards/margins": 1.040657877922058, "rewards/rejected": -9.30879020690918, "semantic_entropy": 0.003615723457187414, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 14.723077590553624, "learning_rate": 8.253926268906144e-07, "logits/chosen": 0.6228159666061401, "logits/rejected": 0.688185453414917, "logps/chosen": -8.453712463378906, "logps/rejected": -9.435036659240723, "loss": 0.4647, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.453712463378906, "rewards/margins": 0.9813230633735657, "rewards/rejected": -9.435036659240723, "semantic_entropy": 0.0034209657460451126, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 13.627077288321304, "learning_rate": 8.242085738980487e-07, "logits/chosen": 0.7107739448547363, "logits/rejected": 0.8460835218429565, "logps/chosen": -8.611701965332031, "logps/rejected": -9.478960990905762, "loss": 0.5379, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.611701965332031, "rewards/margins": 0.8672583699226379, "rewards/rejected": -9.478960990905762, "semantic_entropy": 0.0030593627598136663, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 19.757986263536445, "learning_rate": 8.230213754324772e-07, "logits/chosen": 0.6388633847236633, "logits/rejected": 0.6890886425971985, "logps/chosen": -8.633177757263184, "logps/rejected": -9.407966613769531, "loss": 0.51, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.633177757263184, "rewards/margins": 0.7747882604598999, "rewards/rejected": -9.407966613769531, "semantic_entropy": 0.002916950499638915, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 16.381811199534628, "learning_rate": 8.218310430121045e-07, "logits/chosen": 0.6950886845588684, "logits/rejected": 0.720539927482605, "logps/chosen": -8.707314491271973, "logps/rejected": -9.449724197387695, "loss": 0.558, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.707314491271973, "rewards/margins": 0.74241042137146, "rewards/rejected": -9.449724197387695, "semantic_entropy": 0.0032343785278499126, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 13.044302764643096, "learning_rate": 8.20637588185541e-07, "logits/chosen": 0.6273518800735474, "logits/rejected": 0.6855801939964294, "logps/chosen": -8.859047889709473, "logps/rejected": -9.979398727416992, "loss": 0.4405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.859047889709473, "rewards/margins": 1.1203503608703613, "rewards/rejected": -9.979398727416992, "semantic_entropy": 0.0030110005754977465, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 16.03642878238039, "learning_rate": 8.194410225316906e-07, "logits/chosen": 0.5873863697052002, "logits/rejected": 0.6804260015487671, "logps/chosen": -8.77137565612793, "logps/rejected": -9.576631546020508, "loss": 0.5406, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.77137565612793, "rewards/margins": 0.8052547574043274, "rewards/rejected": -9.576631546020508, "semantic_entropy": 0.0030320039950311184, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 22.084325288130714, "learning_rate": 8.182413576596385e-07, "logits/chosen": 0.6454890370368958, "logits/rejected": 0.6792441606521606, "logps/chosen": -8.825540542602539, "logps/rejected": -9.584020614624023, "loss": 0.5592, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.825540542602539, "rewards/margins": 0.7584813833236694, "rewards/rejected": -9.584020614624023, "semantic_entropy": 0.0030581161845475435, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 19.767902079352925, "learning_rate": 8.170386052085389e-07, "logits/chosen": 0.5810804963111877, "logits/rejected": 0.6741968989372253, "logps/chosen": -8.81079387664795, "logps/rejected": -9.655781745910645, "loss": 0.5413, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.81079387664795, "rewards/margins": 0.8449875712394714, "rewards/rejected": -9.655781745910645, "semantic_entropy": 0.0033272195141762495, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 16.00497801864881, "learning_rate": 8.158327768475008e-07, "logits/chosen": 0.5558470487594604, "logits/rejected": 0.6562130451202393, "logps/chosen": -8.712007522583008, "logps/rejected": -9.437994003295898, "loss": 0.566, "rewards/accuracies": 0.71875, "rewards/chosen": -8.712007522583008, "rewards/margins": 0.7259871959686279, "rewards/rejected": -9.437994003295898, "semantic_entropy": 0.004049594048410654, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 22.86662817638401, "learning_rate": 8.146238842754767e-07, "logits/chosen": 0.48547202348709106, "logits/rejected": 0.5629103779792786, "logps/chosen": -8.967915534973145, "logps/rejected": -9.580000877380371, "loss": 0.5851, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -8.967915534973145, "rewards/margins": 0.6120861172676086, "rewards/rejected": -9.580000877380371, "semantic_entropy": 0.002668407978489995, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 23.23762612453471, "learning_rate": 8.134119392211476e-07, "logits/chosen": 0.5937298536300659, "logits/rejected": 0.7143954038619995, "logps/chosen": -8.771955490112305, "logps/rejected": -9.69508171081543, "loss": 0.5095, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.771955490112305, "rewards/margins": 0.9231254458427429, "rewards/rejected": -9.69508171081543, "semantic_entropy": 0.002983763115480542, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 20.894254936050405, "learning_rate": 8.121969534428094e-07, "logits/chosen": 0.5293421745300293, "logits/rejected": 0.6653727293014526, "logps/chosen": -8.856660842895508, "logps/rejected": -9.56396198272705, "loss": 0.5834, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.856660842895508, "rewards/margins": 0.7073008418083191, "rewards/rejected": -9.56396198272705, "semantic_entropy": 0.002809601603075862, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.7784110307693481, "eval_logits/rejected": 0.8619949221611023, "eval_logps/chosen": -8.725646018981934, "eval_logps/rejected": -9.51314926147461, "eval_loss": 0.5343691110610962, "eval_rewards/accuracies": 0.7158753871917725, "eval_rewards/chosen": -8.725646018981934, "eval_rewards/margins": 0.7875038385391235, "eval_rewards/rejected": -9.51314926147461, "eval_runtime": 34.7505, "eval_samples_per_second": 38.704, "eval_semantic_entropy": 0.00273532303981483, "eval_steps_per_second": 9.698, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 16.92593879734535, "learning_rate": 8.109789387282599e-07, "logits/chosen": 0.5764074921607971, "logits/rejected": 0.6182007193565369, "logps/chosen": -8.691645622253418, "logps/rejected": -9.37775707244873, "loss": 0.5661, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.691645622253418, "rewards/margins": 0.6861115097999573, "rewards/rejected": -9.37775707244873, "semantic_entropy": 0.003028175327926874, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 18.60708769092857, "learning_rate": 8.097579068946827e-07, "logits/chosen": 0.5846803784370422, "logits/rejected": 0.6755378842353821, "logps/chosen": -8.488496780395508, "logps/rejected": -9.236639022827148, "loss": 0.5124, "rewards/accuracies": 0.71875, "rewards/chosen": -8.488496780395508, "rewards/margins": 0.7481436729431152, "rewards/rejected": -9.236639022827148, "semantic_entropy": 0.0031055829022079706, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 16.81295395917736, "learning_rate": 8.085338697885344e-07, "logits/chosen": 0.5960233807563782, "logits/rejected": 0.6982234120368958, "logps/chosen": -8.587759017944336, "logps/rejected": -9.308615684509277, "loss": 0.5304, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.587759017944336, "rewards/margins": 0.720857560634613, "rewards/rejected": -9.308615684509277, "semantic_entropy": 0.003149865660816431, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 19.584362200308394, "learning_rate": 8.073068392854282e-07, "logits/chosen": 0.4914863705635071, "logits/rejected": 0.6277307868003845, "logps/chosen": -8.720789909362793, "logps/rejected": -9.521505355834961, "loss": 0.4904, "rewards/accuracies": 0.78125, "rewards/chosen": -8.720789909362793, "rewards/margins": 0.8007165789604187, "rewards/rejected": -9.521505355834961, "semantic_entropy": 0.0029073634650558233, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 14.905704270703776, "learning_rate": 8.060768272900193e-07, "logits/chosen": 0.5698509812355042, "logits/rejected": 0.6809700727462769, "logps/chosen": -8.530683517456055, "logps/rejected": -9.367597579956055, "loss": 0.5261, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.530683517456055, "rewards/margins": 0.8369154930114746, "rewards/rejected": -9.367597579956055, "semantic_entropy": 0.0036535891704261303, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 11.142809207007566, "learning_rate": 8.0484384573589e-07, "logits/chosen": 0.4973204731941223, "logits/rejected": 0.5573136210441589, "logps/chosen": -8.422048568725586, "logps/rejected": -9.212953567504883, "loss": 0.5264, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.422048568725586, "rewards/margins": 0.7909058332443237, "rewards/rejected": -9.212953567504883, "semantic_entropy": 0.003728007199242711, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 18.70125115826522, "learning_rate": 8.03607906585432e-07, "logits/chosen": 0.5369696617126465, "logits/rejected": 0.6485335230827332, "logps/chosen": -8.6622896194458, "logps/rejected": -9.386190414428711, "loss": 0.5708, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.6622896194458, "rewards/margins": 0.7239011526107788, "rewards/rejected": -9.386190414428711, "semantic_entropy": 0.0035473487805575132, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 26.024840250891568, "learning_rate": 8.023690218297329e-07, "logits/chosen": 0.47266292572021484, "logits/rejected": 0.526736855506897, "logps/chosen": -8.556685447692871, "logps/rejected": -9.491031646728516, "loss": 0.4911, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.556685447692871, "rewards/margins": 0.9343463778495789, "rewards/rejected": -9.491031646728516, "semantic_entropy": 0.0032796214800328016, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 18.59073371986523, "learning_rate": 8.01127203488458e-07, "logits/chosen": 0.5553776025772095, "logits/rejected": 0.6106212735176086, "logps/chosen": -8.6795015335083, "logps/rejected": -9.444005012512207, "loss": 0.5309, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.6795015335083, "rewards/margins": 0.7645029425621033, "rewards/rejected": -9.444005012512207, "semantic_entropy": 0.0029942230321466923, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 19.661856792543684, "learning_rate": 7.998824636097339e-07, "logits/chosen": 0.5739470720291138, "logits/rejected": 0.6971379518508911, "logps/chosen": -8.599574089050293, "logps/rejected": -9.433286666870117, "loss": 0.5109, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.599574089050293, "rewards/margins": 0.833710789680481, "rewards/rejected": -9.433286666870117, "semantic_entropy": 0.002899765968322754, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 16.83932620346083, "learning_rate": 7.986348142700328e-07, "logits/chosen": 0.5915915966033936, "logits/rejected": 0.7208577394485474, "logps/chosen": -8.551434516906738, "logps/rejected": -9.56495189666748, "loss": 0.4975, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.551434516906738, "rewards/margins": 1.0135180950164795, "rewards/rejected": -9.56495189666748, "semantic_entropy": 0.004057818092405796, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 19.759575014791515, "learning_rate": 7.973842675740539e-07, "logits/chosen": 0.644290566444397, "logits/rejected": 0.7044304609298706, "logps/chosen": -8.437231063842773, "logps/rejected": -9.365800857543945, "loss": 0.4995, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.437231063842773, "rewards/margins": 0.928569495677948, "rewards/rejected": -9.365800857543945, "semantic_entropy": 0.00467148469761014, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 19.3959994469509, "learning_rate": 7.961308356546066e-07, "logits/chosen": 0.5765253305435181, "logits/rejected": 0.7118976712226868, "logps/chosen": -8.473932266235352, "logps/rejected": -9.513734817504883, "loss": 0.4958, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.473932266235352, "rewards/margins": 1.0398019552230835, "rewards/rejected": -9.513734817504883, "semantic_entropy": 0.003963841591030359, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 19.393093668750392, "learning_rate": 7.948745306724931e-07, "logits/chosen": 0.6232589483261108, "logits/rejected": 0.7551737427711487, "logps/chosen": -8.12829875946045, "logps/rejected": -9.182195663452148, "loss": 0.4412, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -8.12829875946045, "rewards/margins": 1.0538949966430664, "rewards/rejected": -9.182195663452148, "semantic_entropy": 0.004817788954824209, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 23.64668063780086, "learning_rate": 7.936153648163897e-07, "logits/chosen": 0.5677531957626343, "logits/rejected": 0.6550413966178894, "logps/chosen": -8.326519966125488, "logps/rejected": -9.14603042602539, "loss": 0.5172, "rewards/accuracies": 0.71875, "rewards/chosen": -8.326519966125488, "rewards/margins": 0.8195114135742188, "rewards/rejected": -9.14603042602539, "semantic_entropy": 0.0040381476283073425, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 19.95159503167207, "learning_rate": 7.92353350302729e-07, "logits/chosen": 0.5089942216873169, "logits/rejected": 0.6331297159194946, "logps/chosen": -8.021492004394531, "logps/rejected": -8.944520950317383, "loss": 0.5098, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.021492004394531, "rewards/margins": 0.9230290651321411, "rewards/rejected": -8.944520950317383, "semantic_entropy": 0.005194402299821377, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 25.2963455688314, "learning_rate": 7.910884993755816e-07, "logits/chosen": 0.6509027481079102, "logits/rejected": 0.7161253690719604, "logps/chosen": -8.10318660736084, "logps/rejected": -9.13819408416748, "loss": 0.4955, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.10318660736084, "rewards/margins": 1.0350077152252197, "rewards/rejected": -9.13819408416748, "semantic_entropy": 0.004798793233931065, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 16.477568206890176, "learning_rate": 7.898208243065367e-07, "logits/chosen": 0.6596091389656067, "logits/rejected": 0.6896553635597229, "logps/chosen": -8.11032772064209, "logps/rejected": -8.861970901489258, "loss": 0.533, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.11032772064209, "rewards/margins": 0.7516436576843262, "rewards/rejected": -8.861970901489258, "semantic_entropy": 0.004430143162608147, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 16.367388918717808, "learning_rate": 7.88550337394583e-07, "logits/chosen": 0.640828013420105, "logits/rejected": 0.7348512411117554, "logps/chosen": -8.398119926452637, "logps/rejected": -9.17949104309082, "loss": 0.5304, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.398119926452637, "rewards/margins": 0.7813706398010254, "rewards/rejected": -9.17949104309082, "semantic_entropy": 0.0035932317841798067, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 22.724539658375086, "learning_rate": 7.872770509659905e-07, "logits/chosen": 0.7362472414970398, "logits/rejected": 0.7698075771331787, "logps/chosen": -8.4552583694458, "logps/rejected": -9.216978073120117, "loss": 0.5361, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.4552583694458, "rewards/margins": 0.7617195844650269, "rewards/rejected": -9.216978073120117, "semantic_entropy": 0.003288673236966133, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 17.17232989165224, "learning_rate": 7.860009773741896e-07, "logits/chosen": 0.8084769248962402, "logits/rejected": 0.9146261215209961, "logps/chosen": -8.417569160461426, "logps/rejected": -9.373042106628418, "loss": 0.4631, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.417569160461426, "rewards/margins": 0.9554733037948608, "rewards/rejected": -9.373042106628418, "semantic_entropy": 0.0028464009519666433, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 17.323667244270915, "learning_rate": 7.84722128999652e-07, "logits/chosen": 0.767966091632843, "logits/rejected": 0.8326314687728882, "logps/chosen": -8.67313003540039, "logps/rejected": -9.694366455078125, "loss": 0.4904, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.67313003540039, "rewards/margins": 1.0212359428405762, "rewards/rejected": -9.694366455078125, "semantic_entropy": 0.0024180663749575615, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 18.88990514293712, "learning_rate": 7.834405182497699e-07, "logits/chosen": 0.8208998441696167, "logits/rejected": 0.8627697229385376, "logps/chosen": -8.815618515014648, "logps/rejected": -9.641824722290039, "loss": 0.5307, "rewards/accuracies": 0.75, "rewards/chosen": -8.815618515014648, "rewards/margins": 0.826204776763916, "rewards/rejected": -9.641824722290039, "semantic_entropy": 0.0024063908495008945, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 22.075179293885157, "learning_rate": 7.821561575587368e-07, "logits/chosen": 0.772208571434021, "logits/rejected": 0.8185374140739441, "logps/chosen": -8.663274765014648, "logps/rejected": -9.404105186462402, "loss": 0.5304, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.663274765014648, "rewards/margins": 0.7408307790756226, "rewards/rejected": -9.404105186462402, "semantic_entropy": 0.0030980452429503202, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 16.278122487967558, "learning_rate": 7.808690593874254e-07, "logits/chosen": 0.745190441608429, "logits/rejected": 0.8001850247383118, "logps/chosen": -8.882969856262207, "logps/rejected": -9.755064010620117, "loss": 0.5305, "rewards/accuracies": 0.6875, "rewards/chosen": -8.882969856262207, "rewards/margins": 0.872094452381134, "rewards/rejected": -9.755064010620117, "semantic_entropy": 0.0022930700797587633, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 20.293385998629976, "learning_rate": 7.79579236223268e-07, "logits/chosen": 0.8587775230407715, "logits/rejected": 0.9622253179550171, "logps/chosen": -8.695469856262207, "logps/rejected": -9.686747550964355, "loss": 0.4915, "rewards/accuracies": 0.75, "rewards/chosen": -8.695469856262207, "rewards/margins": 0.991279125213623, "rewards/rejected": -9.686747550964355, "semantic_entropy": 0.0027502470184117556, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 20.703488040967667, "learning_rate": 7.782867005801346e-07, "logits/chosen": 0.765255868434906, "logits/rejected": 0.8885319828987122, "logps/chosen": -8.589404106140137, "logps/rejected": -9.656288146972656, "loss": 0.4855, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.589404106140137, "rewards/margins": 1.0668823719024658, "rewards/rejected": -9.656288146972656, "semantic_entropy": 0.0031393137760460377, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 23.797314324550555, "learning_rate": 7.769914649982117e-07, "logits/chosen": 0.8055821657180786, "logits/rejected": 0.8668516874313354, "logps/chosen": -8.526637077331543, "logps/rejected": -9.445337295532227, "loss": 0.4964, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.526637077331543, "rewards/margins": 0.9186998605728149, "rewards/rejected": -9.445337295532227, "semantic_entropy": 0.003491030540317297, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 15.411161024417602, "learning_rate": 7.756935420438803e-07, "logits/chosen": 0.8090022206306458, "logits/rejected": 0.8830445408821106, "logps/chosen": -8.554264068603516, "logps/rejected": -9.823812484741211, "loss": 0.4472, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.554264068603516, "rewards/margins": 1.269547462463379, "rewards/rejected": -9.823812484741211, "semantic_entropy": 0.0031209487933665514, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 16.571465015989386, "learning_rate": 7.743929443095951e-07, "logits/chosen": 0.773921549320221, "logits/rejected": 0.8259444236755371, "logps/chosen": -8.577144622802734, "logps/rejected": -9.52406120300293, "loss": 0.4723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.577144622802734, "rewards/margins": 0.9469181895256042, "rewards/rejected": -9.52406120300293, "semantic_entropy": 0.0030519163701683283, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 19.150879536804933, "learning_rate": 7.730896844137609e-07, "logits/chosen": 0.7496171593666077, "logits/rejected": 0.8101975321769714, "logps/chosen": -8.777600288391113, "logps/rejected": -9.443353652954102, "loss": 0.5967, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -8.777600288391113, "rewards/margins": 0.6657532453536987, "rewards/rejected": -9.443353652954102, "semantic_entropy": 0.002408596221357584, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 20.084289617276934, "learning_rate": 7.717837750006106e-07, "logits/chosen": 0.7996751666069031, "logits/rejected": 0.8572956919670105, "logps/chosen": -8.599963188171387, "logps/rejected": -9.580442428588867, "loss": 0.5205, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.599963188171387, "rewards/margins": 0.9804786443710327, "rewards/rejected": -9.580442428588867, "semantic_entropy": 0.0031619679648429155, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 19.904342457753952, "learning_rate": 7.704752287400832e-07, "logits/chosen": 0.7399067282676697, "logits/rejected": 0.8611429333686829, "logps/chosen": -8.674205780029297, "logps/rejected": -9.685873985290527, "loss": 0.5023, "rewards/accuracies": 0.78125, "rewards/chosen": -8.674205780029297, "rewards/margins": 1.0116674900054932, "rewards/rejected": -9.685873985290527, "semantic_entropy": 0.0029065976850688457, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 11.68066385401199, "learning_rate": 7.691640583277004e-07, "logits/chosen": 0.8236852884292603, "logits/rejected": 0.8967201113700867, "logps/chosen": -8.778889656066895, "logps/rejected": -9.750692367553711, "loss": 0.5151, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.778889656066895, "rewards/margins": 0.9718036651611328, "rewards/rejected": -9.750692367553711, "semantic_entropy": 0.0026112559717148542, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 13.781442467832825, "learning_rate": 7.678502764844433e-07, "logits/chosen": 0.7699551582336426, "logits/rejected": 0.8938447833061218, "logps/chosen": -8.977490425109863, "logps/rejected": -9.788980484008789, "loss": 0.5165, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.977490425109863, "rewards/margins": 0.8114897012710571, "rewards/rejected": -9.788980484008789, "semantic_entropy": 0.0019960529170930386, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 14.44343492374134, "learning_rate": 7.665338959566288e-07, "logits/chosen": 0.8235516548156738, "logits/rejected": 0.8966760635375977, "logps/chosen": -9.135007858276367, "logps/rejected": -10.119426727294922, "loss": 0.4607, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.135007858276367, "rewards/margins": 0.9844182133674622, "rewards/rejected": -10.119426727294922, "semantic_entropy": 0.0018916798289865255, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 17.800127012280676, "learning_rate": 7.652149295157868e-07, "logits/chosen": 0.8629690408706665, "logits/rejected": 0.9295064210891724, "logps/chosen": -9.345608711242676, "logps/rejected": -10.031997680664062, "loss": 0.5446, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -9.345608711242676, "rewards/margins": 0.6863887906074524, "rewards/rejected": -10.031997680664062, "semantic_entropy": 0.0015552560798823833, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 22.042952861910784, "learning_rate": 7.638933899585354e-07, "logits/chosen": 0.9068318605422974, "logits/rejected": 0.924595832824707, "logps/chosen": -9.173646926879883, "logps/rejected": -10.000932693481445, "loss": 0.5156, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -9.173646926879883, "rewards/margins": 0.8272865414619446, "rewards/rejected": -10.000932693481445, "semantic_entropy": 0.0017204980831593275, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 18.715280386882995, "learning_rate": 7.625692901064573e-07, "logits/chosen": 0.8207842707633972, "logits/rejected": 0.9047282934188843, "logps/chosen": -9.131489753723145, "logps/rejected": -9.951040267944336, "loss": 0.5302, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -9.131489753723145, "rewards/margins": 0.8195503950119019, "rewards/rejected": -9.951040267944336, "semantic_entropy": 0.0020564752630889416, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 14.79847154131496, "learning_rate": 7.61242642805975e-07, "logits/chosen": 0.8447543382644653, "logits/rejected": 0.8663375973701477, "logps/chosen": -9.094636917114258, "logps/rejected": -9.878302574157715, "loss": 0.534, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -9.094636917114258, "rewards/margins": 0.7836667895317078, "rewards/rejected": -9.878302574157715, "semantic_entropy": 0.0019203886622563004, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 17.15954179734088, "learning_rate": 7.599134609282266e-07, "logits/chosen": 0.7871206998825073, "logits/rejected": 0.8676565289497375, "logps/chosen": -9.28339672088623, "logps/rejected": -10.069405555725098, "loss": 0.5129, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -9.28339672088623, "rewards/margins": 0.7860093712806702, "rewards/rejected": -10.069405555725098, "semantic_entropy": 0.001820198493078351, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 24.404592657138192, "learning_rate": 7.585817573689402e-07, "logits/chosen": 0.7938421368598938, "logits/rejected": 0.8801844716072083, "logps/chosen": -8.840426445007324, "logps/rejected": -9.785604476928711, "loss": 0.4784, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.840426445007324, "rewards/margins": 0.9451776742935181, "rewards/rejected": -9.785604476928711, "semantic_entropy": 0.002649650676175952, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 17.132762998778745, "learning_rate": 7.572475450483098e-07, "logits/chosen": 0.7745561003684998, "logits/rejected": 0.8122493624687195, "logps/chosen": -8.980504035949707, "logps/rejected": -9.769770622253418, "loss": 0.5316, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.980504035949707, "rewards/margins": 0.7892670035362244, "rewards/rejected": -9.769770622253418, "semantic_entropy": 0.0022570898290723562, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 20.513738139152867, "learning_rate": 7.559108369108689e-07, "logits/chosen": 0.7253280878067017, "logits/rejected": 0.7848079204559326, "logps/chosen": -8.66881275177002, "logps/rejected": -9.506206512451172, "loss": 0.5316, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.66881275177002, "rewards/margins": 0.8373939394950867, "rewards/rejected": -9.506206512451172, "semantic_entropy": 0.0028397340793162584, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 13.082835254565163, "learning_rate": 7.54571645925366e-07, "logits/chosen": 0.6793020367622375, "logits/rejected": 0.8425678014755249, "logps/chosen": -8.629182815551758, "logps/rejected": -9.746764183044434, "loss": 0.4487, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.629182815551758, "rewards/margins": 1.1175806522369385, "rewards/rejected": -9.746764183044434, "semantic_entropy": 0.003014157759025693, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 15.319269039008326, "learning_rate": 7.532299850846378e-07, "logits/chosen": 0.6559053659439087, "logits/rejected": 0.7742191553115845, "logps/chosen": -8.408263206481934, "logps/rejected": -9.492993354797363, "loss": 0.4948, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.408263206481934, "rewards/margins": 1.084729790687561, "rewards/rejected": -9.492993354797363, "semantic_entropy": 0.0036600581370294094, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 22.81260636310479, "learning_rate": 7.518858674054838e-07, "logits/chosen": 0.6717875003814697, "logits/rejected": 0.8029670715332031, "logps/chosen": -8.644887924194336, "logps/rejected": -9.598337173461914, "loss": 0.5115, "rewards/accuracies": 0.75, "rewards/chosen": -8.644887924194336, "rewards/margins": 0.9534481763839722, "rewards/rejected": -9.598337173461914, "semantic_entropy": 0.002926050452515483, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 17.071928193449306, "learning_rate": 7.505393059285394e-07, "logits/chosen": 0.6294863224029541, "logits/rejected": 0.7521852254867554, "logps/chosen": -8.822403907775879, "logps/rejected": -9.73637866973877, "loss": 0.5241, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.822403907775879, "rewards/margins": 0.9139748811721802, "rewards/rejected": -9.73637866973877, "semantic_entropy": 0.003058222122490406, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 21.59100967195749, "learning_rate": 7.491903137181501e-07, "logits/chosen": 0.6673406362533569, "logits/rejected": 0.6980730295181274, "logps/chosen": -8.757534980773926, "logps/rejected": -9.63608169555664, "loss": 0.4955, "rewards/accuracies": 0.78125, "rewards/chosen": -8.757534980773926, "rewards/margins": 0.8785461187362671, "rewards/rejected": -9.63608169555664, "semantic_entropy": 0.003113445593044162, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 17.12424676715307, "learning_rate": 7.478389038622441e-07, "logits/chosen": 0.6984297633171082, "logits/rejected": 0.7338518500328064, "logps/chosen": -8.893332481384277, "logps/rejected": -9.793367385864258, "loss": 0.527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.893332481384277, "rewards/margins": 0.9000345468521118, "rewards/rejected": -9.793367385864258, "semantic_entropy": 0.002758896443992853, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 26.22447421233564, "learning_rate": 7.46485089472206e-07, "logits/chosen": 0.6646834015846252, "logits/rejected": 0.7228942513465881, "logps/chosen": -8.950407028198242, "logps/rejected": -9.782625198364258, "loss": 0.5624, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.950407028198242, "rewards/margins": 0.8322180509567261, "rewards/rejected": -9.782625198364258, "semantic_entropy": 0.0024748151190578938, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 17.893944580761662, "learning_rate": 7.451288836827487e-07, "logits/chosen": 0.7343819737434387, "logits/rejected": 0.763200044631958, "logps/chosen": -8.684735298156738, "logps/rejected": -9.369165420532227, "loss": 0.5689, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.684735298156738, "rewards/margins": 0.6844292283058167, "rewards/rejected": -9.369165420532227, "semantic_entropy": 0.003174789249897003, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 15.807684902195147, "learning_rate": 7.437702996517869e-07, "logits/chosen": 0.6750258207321167, "logits/rejected": 0.7452644109725952, "logps/chosen": -8.592541694641113, "logps/rejected": -9.485219955444336, "loss": 0.5089, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.592541694641113, "rewards/margins": 0.8926795721054077, "rewards/rejected": -9.485219955444336, "semantic_entropy": 0.0034456239081919193, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 18.6126390663135, "learning_rate": 7.424093505603087e-07, "logits/chosen": 0.6281952857971191, "logits/rejected": 0.7401161789894104, "logps/chosen": -8.631272315979004, "logps/rejected": -9.659812927246094, "loss": 0.4665, "rewards/accuracies": 0.78125, "rewards/chosen": -8.631272315979004, "rewards/margins": 1.0285407304763794, "rewards/rejected": -9.659812927246094, "semantic_entropy": 0.0035267819184809923, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 18.495386475386976, "learning_rate": 7.410460496122482e-07, "logits/chosen": 0.6814571619033813, "logits/rejected": 0.7883174419403076, "logps/chosen": -8.451251983642578, "logps/rejected": -9.589815139770508, "loss": 0.4347, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.451251983642578, "rewards/margins": 1.1385620832443237, "rewards/rejected": -9.589815139770508, "semantic_entropy": 0.0035903877578675747, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 20.866364773443532, "learning_rate": 7.396804100343572e-07, "logits/chosen": 0.6894387602806091, "logits/rejected": 0.7951668500900269, "logps/chosen": -8.350536346435547, "logps/rejected": -9.263737678527832, "loss": 0.492, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.350536346435547, "rewards/margins": 0.9132000207901001, "rewards/rejected": -9.263737678527832, "semantic_entropy": 0.003823335049673915, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 11.808941649198584, "learning_rate": 7.383124450760768e-07, "logits/chosen": 0.7374765276908875, "logits/rejected": 0.8545964956283569, "logps/chosen": -8.481078147888184, "logps/rejected": -9.478879928588867, "loss": 0.4777, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.481078147888184, "rewards/margins": 0.9978022575378418, "rewards/rejected": -9.478879928588867, "semantic_entropy": 0.003645769553259015, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 17.775252169557806, "learning_rate": 7.369421680094091e-07, "logits/chosen": 0.6624468564987183, "logits/rejected": 0.7552576661109924, "logps/chosen": -8.490701675415039, "logps/rejected": -9.470184326171875, "loss": 0.5227, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.490701675415039, "rewards/margins": 0.9794837832450867, "rewards/rejected": -9.470184326171875, "semantic_entropy": 0.0034333504736423492, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 23.97776160790946, "learning_rate": 7.355695921287881e-07, "logits/chosen": 0.6835793256759644, "logits/rejected": 0.7225985527038574, "logps/chosen": -8.687114715576172, "logps/rejected": -9.38883113861084, "loss": 0.6041, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.687114715576172, "rewards/margins": 0.7017166018486023, "rewards/rejected": -9.38883113861084, "semantic_entropy": 0.003160933731123805, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 21.69249472023334, "learning_rate": 7.341947307509513e-07, "logits/chosen": 0.7158384919166565, "logits/rejected": 0.8074569702148438, "logps/chosen": -8.510350227355957, "logps/rejected": -9.450441360473633, "loss": 0.5061, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.510350227355957, "rewards/margins": 0.940090537071228, "rewards/rejected": -9.450441360473633, "semantic_entropy": 0.00320886867120862, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 17.13076671528079, "learning_rate": 7.328175972148094e-07, "logits/chosen": 0.7047310471534729, "logits/rejected": 0.7689910531044006, "logps/chosen": -8.937776565551758, "logps/rejected": -9.842035293579102, "loss": 0.5066, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.937776565551758, "rewards/margins": 0.9042595624923706, "rewards/rejected": -9.842035293579102, "semantic_entropy": 0.0021909018978476524, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 22.05914195430034, "learning_rate": 7.314382048813185e-07, "logits/chosen": 0.7231523394584656, "logits/rejected": 0.8367801904678345, "logps/chosen": -8.771172523498535, "logps/rejected": -9.783547401428223, "loss": 0.4775, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.771172523498535, "rewards/margins": 1.0123744010925293, "rewards/rejected": -9.783547401428223, "semantic_entropy": 0.0027366154827177525, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 15.488052133555222, "learning_rate": 7.300565671333486e-07, "logits/chosen": 0.6668115854263306, "logits/rejected": 0.7803434133529663, "logps/chosen": -8.952492713928223, "logps/rejected": -9.73788070678711, "loss": 0.5417, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.952492713928223, "rewards/margins": 0.7853885293006897, "rewards/rejected": -9.73788070678711, "semantic_entropy": 0.002661502454429865, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 15.301361368412175, "learning_rate": 7.286726973755554e-07, "logits/chosen": 0.7436283826828003, "logits/rejected": 0.783458411693573, "logps/chosen": -8.722562789916992, "logps/rejected": -9.623026847839355, "loss": 0.4961, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.722562789916992, "rewards/margins": 0.9004641771316528, "rewards/rejected": -9.623026847839355, "semantic_entropy": 0.0026164718437939882, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 18.344895783311472, "learning_rate": 7.272866090342493e-07, "logits/chosen": 0.7868816256523132, "logits/rejected": 0.8121258020401001, "logps/chosen": -8.369720458984375, "logps/rejected": -9.339384078979492, "loss": 0.4349, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -8.369720458984375, "rewards/margins": 0.9696633219718933, "rewards/rejected": -9.339384078979492, "semantic_entropy": 0.004334195517003536, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 20.284036778511826, "learning_rate": 7.258983155572656e-07, "logits/chosen": 0.662312388420105, "logits/rejected": 0.7393311262130737, "logps/chosen": -8.260697364807129, "logps/rejected": -9.110601425170898, "loss": 0.5587, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.260697364807129, "rewards/margins": 0.8499045372009277, "rewards/rejected": -9.110601425170898, "semantic_entropy": 0.0039521572180092335, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 13.687544225959545, "learning_rate": 7.245078304138335e-07, "logits/chosen": 0.695865273475647, "logits/rejected": 0.759333074092865, "logps/chosen": -8.318536758422852, "logps/rejected": -9.270764350891113, "loss": 0.4915, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.318536758422852, "rewards/margins": 0.9522277116775513, "rewards/rejected": -9.270764350891113, "semantic_entropy": 0.003750443458557129, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 14.99477345835548, "learning_rate": 7.231151670944462e-07, "logits/chosen": 0.5629149079322815, "logits/rejected": 0.659963846206665, "logps/chosen": -8.367746353149414, "logps/rejected": -9.242141723632812, "loss": 0.5076, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.367746353149414, "rewards/margins": 0.8743956685066223, "rewards/rejected": -9.242141723632812, "semantic_entropy": 0.0034209941513836384, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 14.622457962908232, "learning_rate": 7.217203391107291e-07, "logits/chosen": 0.6555184721946716, "logits/rejected": 0.7649224996566772, "logps/chosen": -8.27055549621582, "logps/rejected": -9.224145889282227, "loss": 0.5084, "rewards/accuracies": 0.78125, "rewards/chosen": -8.27055549621582, "rewards/margins": 0.9535905122756958, "rewards/rejected": -9.224145889282227, "semantic_entropy": 0.0038777173031121492, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 18.63345646684996, "learning_rate": 7.203233599953096e-07, "logits/chosen": 0.6671181917190552, "logits/rejected": 0.7599374651908875, "logps/chosen": -8.387980461120605, "logps/rejected": -9.266815185546875, "loss": 0.4867, "rewards/accuracies": 0.78125, "rewards/chosen": -8.387980461120605, "rewards/margins": 0.8788350820541382, "rewards/rejected": -9.266815185546875, "semantic_entropy": 0.0032208203338086605, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 16.89717027672504, "learning_rate": 7.189242433016852e-07, "logits/chosen": 0.685912013053894, "logits/rejected": 0.7816206812858582, "logps/chosen": -8.186650276184082, "logps/rejected": -9.200610160827637, "loss": 0.4687, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.186650276184082, "rewards/margins": 1.0139598846435547, "rewards/rejected": -9.200610160827637, "semantic_entropy": 0.004348042421042919, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 16.86807483534206, "learning_rate": 7.17523002604092e-07, "logits/chosen": 0.6663065552711487, "logits/rejected": 0.7573191523551941, "logps/chosen": -8.505255699157715, "logps/rejected": -9.44536304473877, "loss": 0.4819, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -8.505255699157715, "rewards/margins": 0.940106987953186, "rewards/rejected": -9.44536304473877, "semantic_entropy": 0.0034768693149089813, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 17.687247649811177, "learning_rate": 7.161196514973734e-07, "logits/chosen": 0.7061843276023865, "logits/rejected": 0.7796521186828613, "logps/chosen": -8.41321086883545, "logps/rejected": -9.373991012573242, "loss": 0.5037, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.41321086883545, "rewards/margins": 0.9607791900634766, "rewards/rejected": -9.373991012573242, "semantic_entropy": 0.0037853557150810957, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 19.15884763927205, "learning_rate": 7.147142035968483e-07, "logits/chosen": 0.7049607038497925, "logits/rejected": 0.8010439872741699, "logps/chosen": -8.644028663635254, "logps/rejected": -9.527328491210938, "loss": 0.4998, "rewards/accuracies": 0.75, "rewards/chosen": -8.644028663635254, "rewards/margins": 0.8833004832267761, "rewards/rejected": -9.527328491210938, "semantic_entropy": 0.0030520078726112843, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 16.73014781307649, "learning_rate": 7.133066725381781e-07, "logits/chosen": 0.637940526008606, "logits/rejected": 0.7165664434432983, "logps/chosen": -8.474000930786133, "logps/rejected": -9.344751358032227, "loss": 0.5156, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.474000930786133, "rewards/margins": 0.8707484006881714, "rewards/rejected": -9.344751358032227, "semantic_entropy": 0.003291874658316374, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 20.729681567322082, "learning_rate": 7.118970719772354e-07, "logits/chosen": 0.6396089792251587, "logits/rejected": 0.747488796710968, "logps/chosen": -8.582317352294922, "logps/rejected": -9.629173278808594, "loss": 0.4984, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.582317352294922, "rewards/margins": 1.0468562841415405, "rewards/rejected": -9.629173278808594, "semantic_entropy": 0.0034858197905123234, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 19.291131856489386, "learning_rate": 7.104854155899711e-07, "logits/chosen": 0.6974250078201294, "logits/rejected": 0.7831848859786987, "logps/chosen": -8.711091041564941, "logps/rejected": -9.662050247192383, "loss": 0.5122, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.711091041564941, "rewards/margins": 0.9509603381156921, "rewards/rejected": -9.662050247192383, "semantic_entropy": 0.0031486363150179386, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 18.66005580137364, "learning_rate": 7.090717170722817e-07, "logits/chosen": 0.6889894008636475, "logits/rejected": 0.7326347231864929, "logps/chosen": -8.706632614135742, "logps/rejected": -9.896500587463379, "loss": 0.4453, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -8.706632614135742, "rewards/margins": 1.1898666620254517, "rewards/rejected": -9.896500587463379, "semantic_entropy": 0.002780457027256489, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 22.671781708487075, "learning_rate": 7.076559901398762e-07, "logits/chosen": 0.6582309603691101, "logits/rejected": 0.7270200252532959, "logps/chosen": -8.679912567138672, "logps/rejected": -9.480433464050293, "loss": 0.5314, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.679912567138672, "rewards/margins": 0.8005210161209106, "rewards/rejected": -9.480433464050293, "semantic_entropy": 0.002767809433862567, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 22.234310993156075, "learning_rate": 7.062382485281436e-07, "logits/chosen": 0.6792951822280884, "logits/rejected": 0.7309907674789429, "logps/chosen": -8.538030624389648, "logps/rejected": -9.394686698913574, "loss": 0.5261, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.538030624389648, "rewards/margins": 0.8566561937332153, "rewards/rejected": -9.394686698913574, "semantic_entropy": 0.0033909387420862913, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.8011821508407593, "eval_logits/rejected": 0.872314453125, "eval_logps/chosen": -8.710298538208008, "eval_logps/rejected": -9.651128768920898, "eval_loss": 0.5312913060188293, "eval_rewards/accuracies": 0.7136498689651489, "eval_rewards/chosen": -8.710298538208008, "eval_rewards/margins": 0.9408305883407593, "eval_rewards/rejected": -9.651128768920898, "eval_runtime": 34.8607, "eval_samples_per_second": 38.582, "eval_semantic_entropy": 0.002928712172433734, "eval_steps_per_second": 9.667, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 14.920939180127396, "learning_rate": 7.048185059920193e-07, "logits/chosen": 0.6384707093238831, "logits/rejected": 0.7600412368774414, "logps/chosen": -8.579252243041992, "logps/rejected": -9.70583724975586, "loss": 0.4806, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.579252243041992, "rewards/margins": 1.1265841722488403, "rewards/rejected": -9.70583724975586, "semantic_entropy": 0.0032008637208491564, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 18.47727128635857, "learning_rate": 7.033967763058516e-07, "logits/chosen": 0.5698826313018799, "logits/rejected": 0.6842392683029175, "logps/chosen": -8.608453750610352, "logps/rejected": -9.420400619506836, "loss": 0.5163, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.608453750610352, "rewards/margins": 0.8119487762451172, "rewards/rejected": -9.420400619506836, "semantic_entropy": 0.0028821511659771204, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 16.15173827430261, "learning_rate": 7.019730732632681e-07, "logits/chosen": 0.6563664078712463, "logits/rejected": 0.7400893568992615, "logps/chosen": -8.490577697753906, "logps/rejected": -9.582011222839355, "loss": 0.4587, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.490577697753906, "rewards/margins": 1.091435194015503, "rewards/rejected": -9.582011222839355, "semantic_entropy": 0.003659659530967474, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 20.007292439163773, "learning_rate": 7.005474106770418e-07, "logits/chosen": 0.57745361328125, "logits/rejected": 0.6826112866401672, "logps/chosen": -8.502618789672852, "logps/rejected": -9.526583671569824, "loss": 0.5005, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.502618789672852, "rewards/margins": 1.0239640474319458, "rewards/rejected": -9.526583671569824, "semantic_entropy": 0.0039854454807937145, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 16.126086508254385, "learning_rate": 6.991198023789577e-07, "logits/chosen": 0.6350833177566528, "logits/rejected": 0.7082042098045349, "logps/chosen": -8.247810363769531, "logps/rejected": -9.1130952835083, "loss": 0.5063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.247810363769531, "rewards/margins": 0.8652847409248352, "rewards/rejected": -9.1130952835083, "semantic_entropy": 0.0047439588233828545, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 23.717409436049728, "learning_rate": 6.976902622196776e-07, "logits/chosen": 0.5765770077705383, "logits/rejected": 0.6420444250106812, "logps/chosen": -8.346854209899902, "logps/rejected": -9.272150993347168, "loss": 0.5377, "rewards/accuracies": 0.75, "rewards/chosen": -8.346854209899902, "rewards/margins": 0.9252961277961731, "rewards/rejected": -9.272150993347168, "semantic_entropy": 0.003574087517336011, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 17.97651242704695, "learning_rate": 6.962588040686064e-07, "logits/chosen": 0.5552124381065369, "logits/rejected": 0.658178448677063, "logps/chosen": -8.291497230529785, "logps/rejected": -9.098273277282715, "loss": 0.5842, "rewards/accuracies": 0.6875, "rewards/chosen": -8.291497230529785, "rewards/margins": 0.806775689125061, "rewards/rejected": -9.098273277282715, "semantic_entropy": 0.004253287799656391, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 20.217954122529044, "learning_rate": 6.948254418137573e-07, "logits/chosen": 0.5669525861740112, "logits/rejected": 0.6433640718460083, "logps/chosen": -8.215181350708008, "logps/rejected": -9.121828079223633, "loss": 0.5425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.215181350708008, "rewards/margins": 0.9066460728645325, "rewards/rejected": -9.121828079223633, "semantic_entropy": 0.004315282683819532, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 21.969818944329187, "learning_rate": 6.933901893616174e-07, "logits/chosen": 0.5023918151855469, "logits/rejected": 0.614323616027832, "logps/chosen": -8.214559555053711, "logps/rejected": -9.092178344726562, "loss": 0.5167, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.214559555053711, "rewards/margins": 0.8776181936264038, "rewards/rejected": -9.092178344726562, "semantic_entropy": 0.004376448690891266, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 21.680469755063655, "learning_rate": 6.919530606370121e-07, "logits/chosen": 0.48196372389793396, "logits/rejected": 0.5732806921005249, "logps/chosen": -8.17034912109375, "logps/rejected": -9.072924613952637, "loss": 0.5107, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.17034912109375, "rewards/margins": 0.9025766253471375, "rewards/rejected": -9.072924613952637, "semantic_entropy": 0.004180104471743107, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 14.07946177566356, "learning_rate": 6.905140695829706e-07, "logits/chosen": 0.47136348485946655, "logits/rejected": 0.6471112370491028, "logps/chosen": -8.491350173950195, "logps/rejected": -9.42007827758789, "loss": 0.4935, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.491350173950195, "rewards/margins": 0.9287282228469849, "rewards/rejected": -9.42007827758789, "semantic_entropy": 0.003645123215392232, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 23.554632267605992, "learning_rate": 6.890732301605904e-07, "logits/chosen": 0.5830351114273071, "logits/rejected": 0.6560341119766235, "logps/chosen": -8.401416778564453, "logps/rejected": -9.3002290725708, "loss": 0.5216, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.401416778564453, "rewards/margins": 0.8988133668899536, "rewards/rejected": -9.3002290725708, "semantic_entropy": 0.0037794325035065413, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 18.146924951726284, "learning_rate": 6.876305563489021e-07, "logits/chosen": 0.5521279573440552, "logits/rejected": 0.6386257410049438, "logps/chosen": -8.719072341918945, "logps/rejected": -9.739156723022461, "loss": 0.4651, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -8.719072341918945, "rewards/margins": 1.0200841426849365, "rewards/rejected": -9.739156723022461, "semantic_entropy": 0.002688236068934202, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 20.103380904418128, "learning_rate": 6.861860621447331e-07, "logits/chosen": 0.5402216911315918, "logits/rejected": 0.6326644420623779, "logps/chosen": -8.76352596282959, "logps/rejected": -9.569908142089844, "loss": 0.5324, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.76352596282959, "rewards/margins": 0.8063834309577942, "rewards/rejected": -9.569908142089844, "semantic_entropy": 0.0027508633211255074, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 19.93100548564064, "learning_rate": 6.847397615625725e-07, "logits/chosen": 0.6381164789199829, "logits/rejected": 0.6684954762458801, "logps/chosen": -8.71910572052002, "logps/rejected": -9.549886703491211, "loss": 0.5264, "rewards/accuracies": 0.71875, "rewards/chosen": -8.71910572052002, "rewards/margins": 0.8307819366455078, "rewards/rejected": -9.549886703491211, "semantic_entropy": 0.0028809071518480778, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 15.7435684829995, "learning_rate": 6.83291668634435e-07, "logits/chosen": 0.6417192220687866, "logits/rejected": 0.7400007843971252, "logps/chosen": -8.722017288208008, "logps/rejected": -9.792933464050293, "loss": 0.4668, "rewards/accuracies": 0.78125, "rewards/chosen": -8.722017288208008, "rewards/margins": 1.0709177255630493, "rewards/rejected": -9.792933464050293, "semantic_entropy": 0.003136158687993884, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 19.570820563611054, "learning_rate": 6.818417974097246e-07, "logits/chosen": 0.7284759283065796, "logits/rejected": 0.8398087620735168, "logps/chosen": -8.61033821105957, "logps/rejected": -9.802716255187988, "loss": 0.4663, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.61033821105957, "rewards/margins": 1.1923778057098389, "rewards/rejected": -9.802716255187988, "semantic_entropy": 0.003474020166322589, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 19.36403331204118, "learning_rate": 6.803901619550981e-07, "logits/chosen": 0.6692131757736206, "logits/rejected": 0.7076988220214844, "logps/chosen": -8.740633964538574, "logps/rejected": -9.593387603759766, "loss": 0.504, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.740633964538574, "rewards/margins": 0.8527532815933228, "rewards/rejected": -9.593387603759766, "semantic_entropy": 0.0032423834782093763, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 18.303427693985547, "learning_rate": 6.789367763543292e-07, "logits/chosen": 0.7160294651985168, "logits/rejected": 0.7507287859916687, "logps/chosen": -8.617566108703613, "logps/rejected": -9.467153549194336, "loss": 0.5475, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.617566108703613, "rewards/margins": 0.8495874404907227, "rewards/rejected": -9.467153549194336, "semantic_entropy": 0.003361668437719345, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 23.850604033392393, "learning_rate": 6.774816547081714e-07, "logits/chosen": 0.6444199681282043, "logits/rejected": 0.7431300282478333, "logps/chosen": -8.590035438537598, "logps/rejected": -9.325902938842773, "loss": 0.5461, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.590035438537598, "rewards/margins": 0.7358676791191101, "rewards/rejected": -9.325902938842773, "semantic_entropy": 0.0030975653789937496, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 18.77168242134909, "learning_rate": 6.760248111342211e-07, "logits/chosen": 0.6908949017524719, "logits/rejected": 0.7892520427703857, "logps/chosen": -8.384283065795898, "logps/rejected": -9.46776008605957, "loss": 0.468, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.384283065795898, "rewards/margins": 1.0834753513336182, "rewards/rejected": -9.46776008605957, "semantic_entropy": 0.003537180367857218, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 18.137558397174647, "learning_rate": 6.745662597667813e-07, "logits/chosen": 0.6804380416870117, "logits/rejected": 0.7819782495498657, "logps/chosen": -8.316720962524414, "logps/rejected": -9.37002944946289, "loss": 0.4474, "rewards/accuracies": 0.78125, "rewards/chosen": -8.316720962524414, "rewards/margins": 1.0533078908920288, "rewards/rejected": -9.37002944946289, "semantic_entropy": 0.0034739505499601364, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 15.709292621392008, "learning_rate": 6.731060147567236e-07, "logits/chosen": 0.7852478623390198, "logits/rejected": 0.8401540517807007, "logps/chosen": -8.328946113586426, "logps/rejected": -9.2665433883667, "loss": 0.4953, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.328946113586426, "rewards/margins": 0.937597393989563, "rewards/rejected": -9.2665433883667, "semantic_entropy": 0.003876983653753996, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 16.260852565852623, "learning_rate": 6.716440902713515e-07, "logits/chosen": 0.720770001411438, "logits/rejected": 0.7988892793655396, "logps/chosen": -8.437568664550781, "logps/rejected": -9.330516815185547, "loss": 0.4806, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.437568664550781, "rewards/margins": 0.8929487466812134, "rewards/rejected": -9.330516815185547, "semantic_entropy": 0.0032981105614453554, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 20.213224180218113, "learning_rate": 6.701805004942627e-07, "logits/chosen": 0.7619292140007019, "logits/rejected": 0.8122636079788208, "logps/chosen": -8.559782981872559, "logps/rejected": -9.494768142700195, "loss": 0.5013, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.559782981872559, "rewards/margins": 0.9349856376647949, "rewards/rejected": -9.494768142700195, "semantic_entropy": 0.0034507550299167633, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 23.935038029395674, "learning_rate": 6.687152596252119e-07, "logits/chosen": 0.8029264211654663, "logits/rejected": 0.8429144620895386, "logps/chosen": -8.917330741882324, "logps/rejected": -9.750707626342773, "loss": 0.5555, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.917330741882324, "rewards/margins": 0.8333770632743835, "rewards/rejected": -9.750707626342773, "semantic_entropy": 0.0024835984222590923, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 26.97837329749874, "learning_rate": 6.672483818799722e-07, "logits/chosen": 0.756155788898468, "logits/rejected": 0.8390409350395203, "logps/chosen": -9.110207557678223, "logps/rejected": -9.918048858642578, "loss": 0.5293, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.110207557678223, "rewards/margins": 0.8078413009643555, "rewards/rejected": -9.918048858642578, "semantic_entropy": 0.0021289088763296604, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 17.786890203549685, "learning_rate": 6.657798814901978e-07, "logits/chosen": 0.7632160186767578, "logits/rejected": 0.8699263334274292, "logps/chosen": -9.151754379272461, "logps/rejected": -10.03592586517334, "loss": 0.499, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.151754379272461, "rewards/margins": 0.884171187877655, "rewards/rejected": -10.03592586517334, "semantic_entropy": 0.0021057447884231806, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 20.119224807141038, "learning_rate": 6.643097727032863e-07, "logits/chosen": 0.7189488410949707, "logits/rejected": 0.8399428129196167, "logps/chosen": -9.119746208190918, "logps/rejected": -10.26237964630127, "loss": 0.4471, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.119746208190918, "rewards/margins": 1.1426328420639038, "rewards/rejected": -10.26237964630127, "semantic_entropy": 0.0021095951087772846, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 20.813656956825653, "learning_rate": 6.628380697822392e-07, "logits/chosen": 0.7267470955848694, "logits/rejected": 0.823375403881073, "logps/chosen": -9.255183219909668, "logps/rejected": -9.974876403808594, "loss": 0.5697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -9.255183219909668, "rewards/margins": 0.7196929454803467, "rewards/rejected": -9.974876403808594, "semantic_entropy": 0.0019825948402285576, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 21.495996998491723, "learning_rate": 6.61364787005525e-07, "logits/chosen": 0.7436253428459167, "logits/rejected": 0.8189195394515991, "logps/chosen": -8.933206558227539, "logps/rejected": -10.065174102783203, "loss": 0.4548, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -8.933206558227539, "rewards/margins": 1.1319692134857178, "rewards/rejected": -10.065174102783203, "semantic_entropy": 0.0025200708769261837, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 20.492917918741757, "learning_rate": 6.598899386669395e-07, "logits/chosen": 0.6491128206253052, "logits/rejected": 0.7273428440093994, "logps/chosen": -8.97862434387207, "logps/rejected": -9.844133377075195, "loss": 0.5339, "rewards/accuracies": 0.75, "rewards/chosen": -8.97862434387207, "rewards/margins": 0.8655084371566772, "rewards/rejected": -9.844133377075195, "semantic_entropy": 0.0024933055974543095, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 29.52052844095437, "learning_rate": 6.584135390754679e-07, "logits/chosen": 0.618812620639801, "logits/rejected": 0.7136391997337341, "logps/chosen": -8.866046905517578, "logps/rejected": -9.910287857055664, "loss": 0.495, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.866046905517578, "rewards/margins": 1.04423987865448, "rewards/rejected": -9.910287857055664, "semantic_entropy": 0.002750510349869728, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 14.963175375540626, "learning_rate": 6.569356025551454e-07, "logits/chosen": 0.6298393607139587, "logits/rejected": 0.6999781727790833, "logps/chosen": -8.907299995422363, "logps/rejected": -9.864768981933594, "loss": 0.5256, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.907299995422363, "rewards/margins": 0.9574697613716125, "rewards/rejected": -9.864768981933594, "semantic_entropy": 0.002827054588124156, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 24.51017226672691, "learning_rate": 6.554561434449186e-07, "logits/chosen": 0.6173363327980042, "logits/rejected": 0.7241848111152649, "logps/chosen": -8.95418930053711, "logps/rejected": -9.835868835449219, "loss": 0.523, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.95418930053711, "rewards/margins": 0.8816791772842407, "rewards/rejected": -9.835868835449219, "semantic_entropy": 0.0022052470594644547, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 23.404717225980484, "learning_rate": 6.539751760985063e-07, "logits/chosen": 0.6575708985328674, "logits/rejected": 0.7401934266090393, "logps/chosen": -9.168517112731934, "logps/rejected": -9.852693557739258, "loss": 0.5703, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -9.168517112731934, "rewards/margins": 0.6841762661933899, "rewards/rejected": -9.852693557739258, "semantic_entropy": 0.0024021922145038843, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 20.395741155582876, "learning_rate": 6.524927148842602e-07, "logits/chosen": 0.6744663119316101, "logits/rejected": 0.7450428009033203, "logps/chosen": -9.043600082397461, "logps/rejected": -9.901880264282227, "loss": 0.5319, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -9.043600082397461, "rewards/margins": 0.8582803010940552, "rewards/rejected": -9.901880264282227, "semantic_entropy": 0.002837617415934801, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 20.903114229317822, "learning_rate": 6.510087741850254e-07, "logits/chosen": 0.6738818287849426, "logits/rejected": 0.7437289953231812, "logps/chosen": -8.894235610961914, "logps/rejected": -9.801753044128418, "loss": 0.5257, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.894235610961914, "rewards/margins": 0.907518744468689, "rewards/rejected": -9.801753044128418, "semantic_entropy": 0.002832833444699645, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 25.8050758741644, "learning_rate": 6.495233683980012e-07, "logits/chosen": 0.6168414354324341, "logits/rejected": 0.659568190574646, "logps/chosen": -9.102640151977539, "logps/rejected": -9.882707595825195, "loss": 0.5363, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.102640151977539, "rewards/margins": 0.780068576335907, "rewards/rejected": -9.882707595825195, "semantic_entropy": 0.002202157862484455, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 22.37455211473883, "learning_rate": 6.480365119346011e-07, "logits/chosen": 0.6996050477027893, "logits/rejected": 0.7838853597640991, "logps/chosen": -8.78913402557373, "logps/rejected": -9.713356018066406, "loss": 0.4855, "rewards/accuracies": 0.75, "rewards/chosen": -8.78913402557373, "rewards/margins": 0.924220085144043, "rewards/rejected": -9.713356018066406, "semantic_entropy": 0.003083221148699522, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 15.384141820184274, "learning_rate": 6.465482192203129e-07, "logits/chosen": 0.705297589302063, "logits/rejected": 0.7580437660217285, "logps/chosen": -8.660966873168945, "logps/rejected": -9.467788696289062, "loss": 0.5043, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.660966873168945, "rewards/margins": 0.8068218231201172, "rewards/rejected": -9.467788696289062, "semantic_entropy": 0.003117068437859416, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 19.949668972130336, "learning_rate": 6.45058504694559e-07, "logits/chosen": 0.6838169097900391, "logits/rejected": 0.7196789383888245, "logps/chosen": -8.577276229858398, "logps/rejected": -9.5936861038208, "loss": 0.4727, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.577276229858398, "rewards/margins": 1.01641047000885, "rewards/rejected": -9.5936861038208, "semantic_entropy": 0.003384160343557596, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 23.363596001347148, "learning_rate": 6.435673828105564e-07, "logits/chosen": 0.6700709462165833, "logits/rejected": 0.7268036007881165, "logps/chosen": -8.639312744140625, "logps/rejected": -9.69861888885498, "loss": 0.4863, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.639312744140625, "rewards/margins": 1.0593070983886719, "rewards/rejected": -9.69861888885498, "semantic_entropy": 0.003042886033654213, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 16.823068494049753, "learning_rate": 6.420748680351763e-07, "logits/chosen": 0.7304331064224243, "logits/rejected": 0.7038922309875488, "logps/chosen": -8.778966903686523, "logps/rejected": -9.502559661865234, "loss": 0.5525, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.778966903686523, "rewards/margins": 0.7235932350158691, "rewards/rejected": -9.502559661865234, "semantic_entropy": 0.0028461969923228025, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 24.59681543759587, "learning_rate": 6.405809748488032e-07, "logits/chosen": 0.6792068481445312, "logits/rejected": 0.7667452096939087, "logps/chosen": -8.76710319519043, "logps/rejected": -9.83530330657959, "loss": 0.4983, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.76710319519043, "rewards/margins": 1.0682008266448975, "rewards/rejected": -9.83530330657959, "semantic_entropy": 0.0029692454263567924, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 19.14917858388336, "learning_rate": 6.390857177451956e-07, "logits/chosen": 0.5627522468566895, "logits/rejected": 0.6816359758377075, "logps/chosen": -8.73742389678955, "logps/rejected": -9.567540168762207, "loss": 0.5112, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.73742389678955, "rewards/margins": 0.8301169276237488, "rewards/rejected": -9.567540168762207, "semantic_entropy": 0.0030936195980757475, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 17.47734554681335, "learning_rate": 6.375891112313445e-07, "logits/chosen": 0.6170838475227356, "logits/rejected": 0.6742144823074341, "logps/chosen": -8.985275268554688, "logps/rejected": -9.946958541870117, "loss": 0.4664, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.985275268554688, "rewards/margins": 0.9616818428039551, "rewards/rejected": -9.946958541870117, "semantic_entropy": 0.002239787485450506, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 18.087276533577075, "learning_rate": 6.360911698273326e-07, "logits/chosen": 0.6644759774208069, "logits/rejected": 0.7190378904342651, "logps/chosen": -9.075571060180664, "logps/rejected": -9.81869888305664, "loss": 0.5604, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -9.075571060180664, "rewards/margins": 0.7431273460388184, "rewards/rejected": -9.81869888305664, "semantic_entropy": 0.002120462479069829, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 17.443905406025888, "learning_rate": 6.345919080661944e-07, "logits/chosen": 0.6211899518966675, "logits/rejected": 0.6783931851387024, "logps/chosen": -8.705963134765625, "logps/rejected": -9.728338241577148, "loss": 0.4588, "rewards/accuracies": 0.78125, "rewards/chosen": -8.705963134765625, "rewards/margins": 1.0223755836486816, "rewards/rejected": -9.728338241577148, "semantic_entropy": 0.0033381134271621704, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 18.870609176206806, "learning_rate": 6.330913404937737e-07, "logits/chosen": 0.6599079370498657, "logits/rejected": 0.736240565776825, "logps/chosen": -8.820722579956055, "logps/rejected": -9.812540054321289, "loss": 0.49, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.820722579956055, "rewards/margins": 0.9918166995048523, "rewards/rejected": -9.812540054321289, "semantic_entropy": 0.002813478233292699, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 20.325954850202407, "learning_rate": 6.315894816685838e-07, "logits/chosen": 0.6205192804336548, "logits/rejected": 0.6998498439788818, "logps/chosen": -8.68531608581543, "logps/rejected": -9.498059272766113, "loss": 0.5063, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.68531608581543, "rewards/margins": 0.8127420544624329, "rewards/rejected": -9.498059272766113, "semantic_entropy": 0.0029031294398009777, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 17.30912171919916, "learning_rate": 6.300863461616657e-07, "logits/chosen": 0.6297236680984497, "logits/rejected": 0.6760424971580505, "logps/chosen": -8.35009765625, "logps/rejected": -9.166117668151855, "loss": 0.5628, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.35009765625, "rewards/margins": 0.816020131111145, "rewards/rejected": -9.166117668151855, "semantic_entropy": 0.003771452931687236, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 15.64812739081427, "learning_rate": 6.285819485564465e-07, "logits/chosen": 0.5272424817085266, "logits/rejected": 0.5996197462081909, "logps/chosen": -8.598016738891602, "logps/rejected": -9.50200080871582, "loss": 0.4918, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.598016738891602, "rewards/margins": 0.9039848446846008, "rewards/rejected": -9.50200080871582, "semantic_entropy": 0.003393507096916437, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 22.54029918814073, "learning_rate": 6.270763034485986e-07, "logits/chosen": 0.6191005706787109, "logits/rejected": 0.6678223609924316, "logps/chosen": -8.656926155090332, "logps/rejected": -9.598608016967773, "loss": 0.5089, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.656926155090332, "rewards/margins": 0.9416826963424683, "rewards/rejected": -9.598608016967773, "semantic_entropy": 0.0036796010099351406, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 38.66235795954998, "learning_rate": 6.255694254458972e-07, "logits/chosen": 0.5672577619552612, "logits/rejected": 0.6477295160293579, "logps/chosen": -8.749332427978516, "logps/rejected": -9.726335525512695, "loss": 0.5089, "rewards/accuracies": 0.75, "rewards/chosen": -8.749332427978516, "rewards/margins": 0.9770025014877319, "rewards/rejected": -9.726335525512695, "semantic_entropy": 0.003146649803966284, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 24.587723329790435, "learning_rate": 6.240613291680795e-07, "logits/chosen": 0.532563328742981, "logits/rejected": 0.6375452280044556, "logps/chosen": -8.473979949951172, "logps/rejected": -9.374165534973145, "loss": 0.5416, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.473979949951172, "rewards/margins": 0.9001848101615906, "rewards/rejected": -9.374165534973145, "semantic_entropy": 0.004033363424241543, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 17.100229454274775, "learning_rate": 6.225520292467021e-07, "logits/chosen": 0.5713559985160828, "logits/rejected": 0.6829615831375122, "logps/chosen": -8.452000617980957, "logps/rejected": -9.610584259033203, "loss": 0.4244, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -8.452000617980957, "rewards/margins": 1.1585838794708252, "rewards/rejected": -9.610584259033203, "semantic_entropy": 0.0038808733224868774, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 22.766099328641907, "learning_rate": 6.210415403249993e-07, "logits/chosen": 0.5507108569145203, "logits/rejected": 0.6832348704338074, "logps/chosen": -8.426264762878418, "logps/rejected": -9.474390029907227, "loss": 0.49, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.426264762878418, "rewards/margins": 1.0481255054473877, "rewards/rejected": -9.474390029907227, "semantic_entropy": 0.004706330597400665, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 21.28132833379913, "learning_rate": 6.195298770577415e-07, "logits/chosen": 0.6715400815010071, "logits/rejected": 0.6841408610343933, "logps/chosen": -8.57282829284668, "logps/rejected": -9.571008682250977, "loss": 0.5092, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.57282829284668, "rewards/margins": 0.9981800317764282, "rewards/rejected": -9.571008682250977, "semantic_entropy": 0.0033726401161402464, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 17.178979708018453, "learning_rate": 6.180170541110923e-07, "logits/chosen": 0.644763708114624, "logits/rejected": 0.746247410774231, "logps/chosen": -8.67873477935791, "logps/rejected": -9.667515754699707, "loss": 0.4689, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.67873477935791, "rewards/margins": 0.9887820482254028, "rewards/rejected": -9.667515754699707, "semantic_entropy": 0.0032643512822687626, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 18.421052634418867, "learning_rate": 6.165030861624663e-07, "logits/chosen": 0.5887877345085144, "logits/rejected": 0.7074635624885559, "logps/chosen": -8.820067405700684, "logps/rejected": -10.023954391479492, "loss": 0.4421, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.820067405700684, "rewards/margins": 1.2038882970809937, "rewards/rejected": -10.023954391479492, "semantic_entropy": 0.002584748435765505, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 18.781234298542874, "learning_rate": 6.149879879003876e-07, "logits/chosen": 0.7198264598846436, "logits/rejected": 0.7411429286003113, "logps/chosen": -8.748977661132812, "logps/rejected": -9.773608207702637, "loss": 0.4872, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.748977661132812, "rewards/margins": 1.0246312618255615, "rewards/rejected": -9.773608207702637, "semantic_entropy": 0.003493456868454814, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 16.626832554859085, "learning_rate": 6.13471774024346e-07, "logits/chosen": 0.6111767292022705, "logits/rejected": 0.6929227113723755, "logps/chosen": -8.618370056152344, "logps/rejected": -9.645769119262695, "loss": 0.4634, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.618370056152344, "rewards/margins": 1.0273983478546143, "rewards/rejected": -9.645769119262695, "semantic_entropy": 0.003336191177368164, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 15.275820526471591, "learning_rate": 6.119544592446551e-07, "logits/chosen": 0.6123021841049194, "logits/rejected": 0.6909358501434326, "logps/chosen": -8.73341178894043, "logps/rejected": -9.467869758605957, "loss": 0.5598, "rewards/accuracies": 0.6875, "rewards/chosen": -8.73341178894043, "rewards/margins": 0.7344561815261841, "rewards/rejected": -9.467869758605957, "semantic_entropy": 0.0026931720785796642, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 22.157104502252846, "learning_rate": 6.104360582823096e-07, "logits/chosen": 0.7188630104064941, "logits/rejected": 0.7658997774124146, "logps/chosen": -8.619566917419434, "logps/rejected": -9.543168067932129, "loss": 0.4784, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.619566917419434, "rewards/margins": 0.9236003160476685, "rewards/rejected": -9.543168067932129, "semantic_entropy": 0.003137335879728198, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 20.352841774469262, "learning_rate": 6.089165858688423e-07, "logits/chosen": 0.6846107244491577, "logits/rejected": 0.7849973440170288, "logps/chosen": -8.482339859008789, "logps/rejected": -9.482155799865723, "loss": 0.5162, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.482339859008789, "rewards/margins": 0.9998153448104858, "rewards/rejected": -9.482155799865723, "semantic_entropy": 0.0033546772319823503, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 12.410524435742113, "learning_rate": 6.073960567461811e-07, "logits/chosen": 0.7148826718330383, "logits/rejected": 0.8275319933891296, "logps/chosen": -8.265142440795898, "logps/rejected": -9.430082321166992, "loss": 0.417, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -8.265142440795898, "rewards/margins": 1.1649402379989624, "rewards/rejected": -9.430082321166992, "semantic_entropy": 0.0043460773304104805, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 19.113883533650075, "learning_rate": 6.058744856665065e-07, "logits/chosen": 0.63080894947052, "logits/rejected": 0.6792045831680298, "logps/chosen": -8.33402156829834, "logps/rejected": -9.474609375, "loss": 0.4612, "rewards/accuracies": 0.78125, "rewards/chosen": -8.33402156829834, "rewards/margins": 1.140586256980896, "rewards/rejected": -9.474609375, "semantic_entropy": 0.0038649775087833405, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 19.573081780536924, "learning_rate": 6.043518873921074e-07, "logits/chosen": 0.6687484979629517, "logits/rejected": 0.7621025443077087, "logps/chosen": -8.205583572387695, "logps/rejected": -9.156460762023926, "loss": 0.4883, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.205583572387695, "rewards/margins": 0.9508770108222961, "rewards/rejected": -9.156460762023926, "semantic_entropy": 0.004209198523312807, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 19.22285858261932, "learning_rate": 6.028282766952393e-07, "logits/chosen": 0.6872994303703308, "logits/rejected": 0.7447125315666199, "logps/chosen": -8.262472152709961, "logps/rejected": -9.375197410583496, "loss": 0.4668, "rewards/accuracies": 0.8125, "rewards/chosen": -8.262472152709961, "rewards/margins": 1.1127252578735352, "rewards/rejected": -9.375197410583496, "semantic_entropy": 0.004294519778341055, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 28.965956806202943, "learning_rate": 6.013036683579798e-07, "logits/chosen": 0.7001906633377075, "logits/rejected": 0.7653275728225708, "logps/chosen": -8.254480361938477, "logps/rejected": -9.233253479003906, "loss": 0.5039, "rewards/accuracies": 0.78125, "rewards/chosen": -8.254480361938477, "rewards/margins": 0.9787724614143372, "rewards/rejected": -9.233253479003906, "semantic_entropy": 0.00447105010971427, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 19.46901251194481, "learning_rate": 5.997780771720854e-07, "logits/chosen": 0.6296931505203247, "logits/rejected": 0.7145162224769592, "logps/chosen": -8.382627487182617, "logps/rejected": -9.445598602294922, "loss": 0.4638, "rewards/accuracies": 0.78125, "rewards/chosen": -8.382627487182617, "rewards/margins": 1.0629713535308838, "rewards/rejected": -9.445598602294922, "semantic_entropy": 0.004158531315624714, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 19.53223501670479, "learning_rate": 5.982515179388486e-07, "logits/chosen": 0.7034457325935364, "logits/rejected": 0.7711877226829529, "logps/chosen": -8.465707778930664, "logps/rejected": -9.426530838012695, "loss": 0.5123, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.465707778930664, "rewards/margins": 0.9608221054077148, "rewards/rejected": -9.426530838012695, "semantic_entropy": 0.003798137651756406, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 14.100323670272886, "learning_rate": 5.967240054689541e-07, "logits/chosen": 0.6083649396896362, "logits/rejected": 0.6571983098983765, "logps/chosen": -8.479659080505371, "logps/rejected": -9.488851547241211, "loss": 0.4886, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.479659080505371, "rewards/margins": 1.0091919898986816, "rewards/rejected": -9.488851547241211, "semantic_entropy": 0.0037951588165014982, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 19.799070365162166, "learning_rate": 5.951955545823342e-07, "logits/chosen": 0.6102844476699829, "logits/rejected": 0.6613792181015015, "logps/chosen": -8.798731803894043, "logps/rejected": -9.752110481262207, "loss": 0.5194, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.798731803894043, "rewards/margins": 0.9533787965774536, "rewards/rejected": -9.752110481262207, "semantic_entropy": 0.003143253503367305, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 17.219558141254097, "learning_rate": 5.936661801080263e-07, "logits/chosen": 0.5687362551689148, "logits/rejected": 0.634920597076416, "logps/chosen": -8.662237167358398, "logps/rejected": -9.508859634399414, "loss": 0.5463, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -8.662237167358398, "rewards/margins": 0.8466218709945679, "rewards/rejected": -9.508859634399414, "semantic_entropy": 0.0033071953803300858, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 20.01873928855486, "learning_rate": 5.92135896884028e-07, "logits/chosen": 0.6002562642097473, "logits/rejected": 0.696466326713562, "logps/chosen": -8.673624992370605, "logps/rejected": -9.823293685913086, "loss": 0.455, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.673624992370605, "rewards/margins": 1.1496690511703491, "rewards/rejected": -9.823293685913086, "semantic_entropy": 0.003196306060999632, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 30.437982634244353, "learning_rate": 5.906047197571541e-07, "logits/chosen": 0.5805534720420837, "logits/rejected": 0.5821332931518555, "logps/chosen": -8.47557258605957, "logps/rejected": -9.357189178466797, "loss": 0.5343, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.47557258605957, "rewards/margins": 0.8816182017326355, "rewards/rejected": -9.357189178466797, "semantic_entropy": 0.0038487245328724384, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 14.831624104584504, "learning_rate": 5.890726635828919e-07, "logits/chosen": 0.5990924835205078, "logits/rejected": 0.6138975024223328, "logps/chosen": -8.312957763671875, "logps/rejected": -9.31361198425293, "loss": 0.5031, "rewards/accuracies": 0.75, "rewards/chosen": -8.312957763671875, "rewards/margins": 1.0006548166275024, "rewards/rejected": -9.31361198425293, "semantic_entropy": 0.004499537404626608, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 21.33910911582425, "learning_rate": 5.875397432252569e-07, "logits/chosen": 0.5481540560722351, "logits/rejected": 0.6002416610717773, "logps/chosen": -8.367044448852539, "logps/rejected": -9.348922729492188, "loss": 0.4879, "rewards/accuracies": 0.78125, "rewards/chosen": -8.367044448852539, "rewards/margins": 0.9818779230117798, "rewards/rejected": -9.348922729492188, "semantic_entropy": 0.004131897818297148, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.6895690560340881, "eval_logits/rejected": 0.749624490737915, "eval_logps/chosen": -8.626724243164062, "eval_logps/rejected": -9.53298282623291, "eval_loss": 0.5264463424682617, "eval_rewards/accuracies": 0.7218101024627686, "eval_rewards/chosen": -8.626724243164062, "eval_rewards/margins": 0.9062579870223999, "eval_rewards/rejected": -9.53298282623291, "eval_runtime": 35.1374, "eval_samples_per_second": 38.278, "eval_semantic_entropy": 0.0033146331552416086, "eval_steps_per_second": 9.591, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 15.846955607641894, "learning_rate": 5.860059735566491e-07, "logits/chosen": 0.4758935868740082, "logits/rejected": 0.5631710290908813, "logps/chosen": -8.500029563903809, "logps/rejected": -9.449440002441406, "loss": 0.5011, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.500029563903809, "rewards/margins": 0.9494104385375977, "rewards/rejected": -9.449440002441406, "semantic_entropy": 0.004371006041765213, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 24.705548492382672, "learning_rate": 5.844713694577087e-07, "logits/chosen": 0.5791555643081665, "logits/rejected": 0.6237837672233582, "logps/chosen": -8.626651763916016, "logps/rejected": -9.473905563354492, "loss": 0.5144, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.626651763916016, "rewards/margins": 0.8472524881362915, "rewards/rejected": -9.473905563354492, "semantic_entropy": 0.003329088445752859, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 14.693823746449954, "learning_rate": 5.829359458171714e-07, "logits/chosen": 0.5436751842498779, "logits/rejected": 0.5992386341094971, "logps/chosen": -8.613186836242676, "logps/rejected": -9.71760368347168, "loss": 0.4307, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -8.613186836242676, "rewards/margins": 1.1044175624847412, "rewards/rejected": -9.71760368347168, "semantic_entropy": 0.003140996443107724, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 15.499555019749941, "learning_rate": 5.81399717531724e-07, "logits/chosen": 0.5179445147514343, "logits/rejected": 0.6111316084861755, "logps/chosen": -8.66978645324707, "logps/rejected": -9.439632415771484, "loss": 0.5827, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.66978645324707, "rewards/margins": 0.7698466777801514, "rewards/rejected": -9.439632415771484, "semantic_entropy": 0.0032921708188951015, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 16.093368752669864, "learning_rate": 5.798626995058602e-07, "logits/chosen": 0.5145665407180786, "logits/rejected": 0.6263571977615356, "logps/chosen": -8.70081901550293, "logps/rejected": -9.660847663879395, "loss": 0.4992, "rewards/accuracies": 0.71875, "rewards/chosen": -8.70081901550293, "rewards/margins": 0.9600294232368469, "rewards/rejected": -9.660847663879395, "semantic_entropy": 0.002892556134611368, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 13.79559161731438, "learning_rate": 5.783249066517354e-07, "logits/chosen": 0.5084502696990967, "logits/rejected": 0.5790830850601196, "logps/chosen": -8.357169151306152, "logps/rejected": -9.426858901977539, "loss": 0.4507, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.357169151306152, "rewards/margins": 1.0696887969970703, "rewards/rejected": -9.426858901977539, "semantic_entropy": 0.0035817469470202923, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 17.910427446812193, "learning_rate": 5.767863538890228e-07, "logits/chosen": 0.5757554769515991, "logits/rejected": 0.6622332334518433, "logps/chosen": -8.620783805847168, "logps/rejected": -9.76025390625, "loss": 0.4271, "rewards/accuracies": 0.8125, "rewards/chosen": -8.620783805847168, "rewards/margins": 1.1394703388214111, "rewards/rejected": -9.76025390625, "semantic_entropy": 0.0031910459510982037, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 17.705706566190077, "learning_rate": 5.75247056144768e-07, "logits/chosen": 0.5833605527877808, "logits/rejected": 0.6117344498634338, "logps/chosen": -8.490338325500488, "logps/rejected": -9.415372848510742, "loss": 0.5481, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.490338325500488, "rewards/margins": 0.925035834312439, "rewards/rejected": -9.415372848510742, "semantic_entropy": 0.0037457395810633898, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 19.220022114950083, "learning_rate": 5.737070283532444e-07, "logits/chosen": 0.6395395994186401, "logits/rejected": 0.6757252812385559, "logps/chosen": -8.574124336242676, "logps/rejected": -9.431645393371582, "loss": 0.5835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.574124336242676, "rewards/margins": 0.8575227856636047, "rewards/rejected": -9.431645393371582, "semantic_entropy": 0.0034476309083402157, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 16.469084913870834, "learning_rate": 5.721662854558084e-07, "logits/chosen": 0.5754357576370239, "logits/rejected": 0.6329125165939331, "logps/chosen": -8.597195625305176, "logps/rejected": -9.660018920898438, "loss": 0.4696, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.597195625305176, "rewards/margins": 1.0628234148025513, "rewards/rejected": -9.660018920898438, "semantic_entropy": 0.003124454291537404, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 18.533405817853968, "learning_rate": 5.706248424007545e-07, "logits/chosen": 0.4836948812007904, "logits/rejected": 0.5998759269714355, "logps/chosen": -8.543926239013672, "logps/rejected": -9.415602684020996, "loss": 0.5262, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.543926239013672, "rewards/margins": 0.871677577495575, "rewards/rejected": -9.415602684020996, "semantic_entropy": 0.0034175370819866657, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 16.66857458253813, "learning_rate": 5.690827141431699e-07, "logits/chosen": 0.5200189352035522, "logits/rejected": 0.6539384126663208, "logps/chosen": -8.513689041137695, "logps/rejected": -9.264158248901367, "loss": 0.5351, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.513689041137695, "rewards/margins": 0.7504681348800659, "rewards/rejected": -9.264158248901367, "semantic_entropy": 0.00305316224694252, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 21.309556253730577, "learning_rate": 5.675399156447897e-07, "logits/chosen": 0.5738528370857239, "logits/rejected": 0.6424544453620911, "logps/chosen": -8.373230934143066, "logps/rejected": -9.177302360534668, "loss": 0.5274, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.373230934143066, "rewards/margins": 0.8040705919265747, "rewards/rejected": -9.177302360534668, "semantic_entropy": 0.0038657269906252623, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 18.48692651527053, "learning_rate": 5.659964618738515e-07, "logits/chosen": 0.5925968289375305, "logits/rejected": 0.6500064730644226, "logps/chosen": -8.42739486694336, "logps/rejected": -9.342714309692383, "loss": 0.524, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.42739486694336, "rewards/margins": 0.9153194427490234, "rewards/rejected": -9.342714309692383, "semantic_entropy": 0.0032528643496334553, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 18.694567668937122, "learning_rate": 5.644523678049509e-07, "logits/chosen": 0.5311469435691833, "logits/rejected": 0.6227244138717651, "logps/chosen": -8.448528289794922, "logps/rejected": -9.284102439880371, "loss": 0.5258, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.448528289794922, "rewards/margins": 0.8355741500854492, "rewards/rejected": -9.284102439880371, "semantic_entropy": 0.0036838327068835497, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 20.341898488782054, "learning_rate": 5.629076484188952e-07, "logits/chosen": 0.6697776913642883, "logits/rejected": 0.7325208187103271, "logps/chosen": -8.417803764343262, "logps/rejected": -9.420540809631348, "loss": 0.4782, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.417803764343262, "rewards/margins": 1.0027358531951904, "rewards/rejected": -9.420540809631348, "semantic_entropy": 0.0036022099666297436, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 17.610305324362457, "learning_rate": 5.613623187025587e-07, "logits/chosen": 0.5705369710922241, "logits/rejected": 0.6492313146591187, "logps/chosen": -8.502610206604004, "logps/rejected": -9.509003639221191, "loss": 0.4879, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.502610206604004, "rewards/margins": 1.0063927173614502, "rewards/rejected": -9.509003639221191, "semantic_entropy": 0.003285133745521307, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 17.881369724017024, "learning_rate": 5.598163936487369e-07, "logits/chosen": 0.573552131652832, "logits/rejected": 0.6860645413398743, "logps/chosen": -8.498343467712402, "logps/rejected": -9.553579330444336, "loss": 0.4739, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.498343467712402, "rewards/margins": 1.055237054824829, "rewards/rejected": -9.553579330444336, "semantic_entropy": 0.0032834571320563555, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 17.628648766982945, "learning_rate": 5.582698882560017e-07, "logits/chosen": 0.6237468719482422, "logits/rejected": 0.7165063619613647, "logps/chosen": -8.482809066772461, "logps/rejected": -9.492294311523438, "loss": 0.4927, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.482809066772461, "rewards/margins": 1.0094853639602661, "rewards/rejected": -9.492294311523438, "semantic_entropy": 0.003665131749585271, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 17.76187822982553, "learning_rate": 5.567228175285549e-07, "logits/chosen": 0.6243129968643188, "logits/rejected": 0.7127053737640381, "logps/chosen": -8.367454528808594, "logps/rejected": -9.463773727416992, "loss": 0.455, "rewards/accuracies": 0.75, "rewards/chosen": -8.367454528808594, "rewards/margins": 1.0963184833526611, "rewards/rejected": -9.463773727416992, "semantic_entropy": 0.003789290087297559, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 22.486981787052432, "learning_rate": 5.551751964760838e-07, "logits/chosen": 0.7026504278182983, "logits/rejected": 0.7289483547210693, "logps/chosen": -8.396336555480957, "logps/rejected": -9.454975128173828, "loss": 0.4576, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -8.396336555480957, "rewards/margins": 1.058638334274292, "rewards/rejected": -9.454975128173828, "semantic_entropy": 0.003976074513047934, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 21.34998407613261, "learning_rate": 5.536270401136145e-07, "logits/chosen": 0.6059376001358032, "logits/rejected": 0.6654237508773804, "logps/chosen": -8.55673599243164, "logps/rejected": -9.484312057495117, "loss": 0.4952, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.55673599243164, "rewards/margins": 0.927575945854187, "rewards/rejected": -9.484312057495117, "semantic_entropy": 0.0034220025409013033, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 25.338171297276553, "learning_rate": 5.520783634613667e-07, "logits/chosen": 0.6434666514396667, "logits/rejected": 0.7561715841293335, "logps/chosen": -8.727119445800781, "logps/rejected": -9.780064582824707, "loss": 0.5051, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.727119445800781, "rewards/margins": 1.0529462099075317, "rewards/rejected": -9.780064582824707, "semantic_entropy": 0.002783454256132245, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 19.446663229697716, "learning_rate": 5.505291815446082e-07, "logits/chosen": 0.622826099395752, "logits/rejected": 0.6913628578186035, "logps/chosen": -8.684412002563477, "logps/rejected": -9.680010795593262, "loss": 0.5004, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.684412002563477, "rewards/margins": 0.9955987930297852, "rewards/rejected": -9.680010795593262, "semantic_entropy": 0.0030565441120415926, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 21.499839435107912, "learning_rate": 5.489795093935089e-07, "logits/chosen": 0.66752690076828, "logits/rejected": 0.7305563688278198, "logps/chosen": -8.636419296264648, "logps/rejected": -9.558130264282227, "loss": 0.5297, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.636419296264648, "rewards/margins": 0.9217103123664856, "rewards/rejected": -9.558130264282227, "semantic_entropy": 0.0032252557575702667, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 17.043070253231157, "learning_rate": 5.474293620429946e-07, "logits/chosen": 0.6017109155654907, "logits/rejected": 0.6921178698539734, "logps/chosen": -8.539863586425781, "logps/rejected": -9.874781608581543, "loss": 0.455, "rewards/accuracies": 0.8125, "rewards/chosen": -8.539863586425781, "rewards/margins": 1.334917426109314, "rewards/rejected": -9.874781608581543, "semantic_entropy": 0.0031048119999468327, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 17.31470565155956, "learning_rate": 5.458787545326018e-07, "logits/chosen": 0.6002456545829773, "logits/rejected": 0.6670821905136108, "logps/chosen": -8.838860511779785, "logps/rejected": -9.790765762329102, "loss": 0.4888, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.838860511779785, "rewards/margins": 0.9519071578979492, "rewards/rejected": -9.790765762329102, "semantic_entropy": 0.0028922937344759703, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 18.420355066758702, "learning_rate": 5.443277019063311e-07, "logits/chosen": 0.6272684335708618, "logits/rejected": 0.7411568760871887, "logps/chosen": -8.951470375061035, "logps/rejected": -10.118718147277832, "loss": 0.4787, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.951470375061035, "rewards/margins": 1.1672481298446655, "rewards/rejected": -10.118718147277832, "semantic_entropy": 0.0028697990346699953, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 24.156663129325842, "learning_rate": 5.427762192125023e-07, "logits/chosen": 0.6460695862770081, "logits/rejected": 0.7259084582328796, "logps/chosen": -8.902268409729004, "logps/rejected": -9.860254287719727, "loss": 0.5189, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.902268409729004, "rewards/margins": 0.9579856991767883, "rewards/rejected": -9.860254287719727, "semantic_entropy": 0.0026515666395425797, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 28.217781936607324, "learning_rate": 5.41224321503607e-07, "logits/chosen": 0.6646770238876343, "logits/rejected": 0.7777436375617981, "logps/chosen": -8.907236099243164, "logps/rejected": -10.02210807800293, "loss": 0.422, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -8.907236099243164, "rewards/margins": 1.1148706674575806, "rewards/rejected": -10.02210807800293, "semantic_entropy": 0.0026082415133714676, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 22.254732967367147, "learning_rate": 5.396720238361637e-07, "logits/chosen": 0.7216917872428894, "logits/rejected": 0.781305193901062, "logps/chosen": -8.932952880859375, "logps/rejected": -9.814626693725586, "loss": 0.5321, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.932952880859375, "rewards/margins": 0.8816744089126587, "rewards/rejected": -9.814626693725586, "semantic_entropy": 0.0031717985402792692, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 17.792398556391674, "learning_rate": 5.381193412705711e-07, "logits/chosen": 0.6349023580551147, "logits/rejected": 0.7170180678367615, "logps/chosen": -8.807271003723145, "logps/rejected": -9.855276107788086, "loss": 0.4582, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.807271003723145, "rewards/margins": 1.0480067729949951, "rewards/rejected": -9.855276107788086, "semantic_entropy": 0.0029146361630409956, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 16.167553601713156, "learning_rate": 5.365662888709622e-07, "logits/chosen": 0.6512748599052429, "logits/rejected": 0.7206937074661255, "logps/chosen": -8.949135780334473, "logps/rejected": -10.088407516479492, "loss": 0.4511, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.949135780334473, "rewards/margins": 1.139272689819336, "rewards/rejected": -10.088407516479492, "semantic_entropy": 0.0026951334439218044, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 21.334679501529834, "learning_rate": 5.350128817050585e-07, "logits/chosen": 0.6061184406280518, "logits/rejected": 0.6971312761306763, "logps/chosen": -9.021596908569336, "logps/rejected": -9.995620727539062, "loss": 0.5042, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.021596908569336, "rewards/margins": 0.974023163318634, "rewards/rejected": -9.995620727539062, "semantic_entropy": 0.002106505911797285, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 27.712943840731256, "learning_rate": 5.334591348440229e-07, "logits/chosen": 0.6605676412582397, "logits/rejected": 0.7498981952667236, "logps/chosen": -8.881559371948242, "logps/rejected": -9.735440254211426, "loss": 0.5282, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.881559371948242, "rewards/margins": 0.8538818359375, "rewards/rejected": -9.735440254211426, "semantic_entropy": 0.0024334299378097057, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 14.991037570020522, "learning_rate": 5.319050633623141e-07, "logits/chosen": 0.6337238550186157, "logits/rejected": 0.7245572805404663, "logps/chosen": -8.76764965057373, "logps/rejected": -9.690933227539062, "loss": 0.482, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.76764965057373, "rewards/margins": 0.9232838749885559, "rewards/rejected": -9.690933227539062, "semantic_entropy": 0.002793360035866499, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 24.077369910851743, "learning_rate": 5.303506823375409e-07, "logits/chosen": 0.5908278226852417, "logits/rejected": 0.7130194902420044, "logps/chosen": -8.76014518737793, "logps/rejected": -9.954129219055176, "loss": 0.5029, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.76014518737793, "rewards/margins": 1.1939831972122192, "rewards/rejected": -9.954129219055176, "semantic_entropy": 0.003040383802726865, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 15.211762910677832, "learning_rate": 5.287960068503143e-07, "logits/chosen": 0.6387141942977905, "logits/rejected": 0.7284534573554993, "logps/chosen": -8.648200035095215, "logps/rejected": -9.790531158447266, "loss": 0.4399, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.648200035095215, "rewards/margins": 1.1423308849334717, "rewards/rejected": -9.790531158447266, "semantic_entropy": 0.003230876522138715, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 20.214549437081253, "learning_rate": 5.272410519841032e-07, "logits/chosen": 0.6860362887382507, "logits/rejected": 0.7700978517532349, "logps/chosen": -8.748394012451172, "logps/rejected": -9.925196647644043, "loss": 0.4639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.748394012451172, "rewards/margins": 1.1768031120300293, "rewards/rejected": -9.925196647644043, "semantic_entropy": 0.002992126392200589, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 13.676962647539636, "learning_rate": 5.256858328250861e-07, "logits/chosen": 0.6779240965843201, "logits/rejected": 0.7815280556678772, "logps/chosen": -8.563261985778809, "logps/rejected": -9.472494125366211, "loss": 0.5373, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.563261985778809, "rewards/margins": 0.9092334508895874, "rewards/rejected": -9.472494125366211, "semantic_entropy": 0.0036033024080097675, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 35.682884824697986, "learning_rate": 5.241303644620063e-07, "logits/chosen": 0.6307097673416138, "logits/rejected": 0.7322098612785339, "logps/chosen": -8.694342613220215, "logps/rejected": -9.436209678649902, "loss": 0.5628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.694342613220215, "rewards/margins": 0.7418667078018188, "rewards/rejected": -9.436209678649902, "semantic_entropy": 0.003090116661041975, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 20.559441487883948, "learning_rate": 5.225746619860248e-07, "logits/chosen": 0.651374101638794, "logits/rejected": 0.7265350222587585, "logps/chosen": -8.542802810668945, "logps/rejected": -9.415166854858398, "loss": 0.5648, "rewards/accuracies": 0.75, "rewards/chosen": -8.542802810668945, "rewards/margins": 0.8723649978637695, "rewards/rejected": -9.415166854858398, "semantic_entropy": 0.0034926377702504396, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 24.620427653936158, "learning_rate": 5.210187404905735e-07, "logits/chosen": 0.7381612658500671, "logits/rejected": 0.7867849469184875, "logps/chosen": -8.523360252380371, "logps/rejected": -9.46071720123291, "loss": 0.5041, "rewards/accuracies": 0.75, "rewards/chosen": -8.523360252380371, "rewards/margins": 0.9373563528060913, "rewards/rejected": -9.46071720123291, "semantic_entropy": 0.003742937697097659, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 20.150742954590974, "learning_rate": 5.194626150712098e-07, "logits/chosen": 0.6840203404426575, "logits/rejected": 0.7242007851600647, "logps/chosen": -8.462261199951172, "logps/rejected": -9.2748384475708, "loss": 0.5223, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.462261199951172, "rewards/margins": 0.8125771284103394, "rewards/rejected": -9.2748384475708, "semantic_entropy": 0.0034177147317677736, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 22.925549219305793, "learning_rate": 5.179063008254695e-07, "logits/chosen": 0.6633858680725098, "logits/rejected": 0.7617511749267578, "logps/chosen": -8.355466842651367, "logps/rejected": -9.252098083496094, "loss": 0.5333, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.355466842651367, "rewards/margins": 0.896629810333252, "rewards/rejected": -9.252098083496094, "semantic_entropy": 0.003791496157646179, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 24.09406066747849, "learning_rate": 5.163498128527199e-07, "logits/chosen": 0.6741195917129517, "logits/rejected": 0.7528184056282043, "logps/chosen": -8.633956909179688, "logps/rejected": -9.625297546386719, "loss": 0.5164, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.633956909179688, "rewards/margins": 0.9913405179977417, "rewards/rejected": -9.625297546386719, "semantic_entropy": 0.0035090327728539705, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 17.245493463371307, "learning_rate": 5.147931662540144e-07, "logits/chosen": 0.7293022871017456, "logits/rejected": 0.8059667348861694, "logps/chosen": -8.407529830932617, "logps/rejected": -9.319184303283691, "loss": 0.5002, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.407529830932617, "rewards/margins": 0.9116536378860474, "rewards/rejected": -9.319184303283691, "semantic_entropy": 0.004189362749457359, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 22.579101949094103, "learning_rate": 5.132363761319449e-07, "logits/chosen": 0.6055505275726318, "logits/rejected": 0.6722275018692017, "logps/chosen": -8.391597747802734, "logps/rejected": -9.499109268188477, "loss": 0.4719, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.391597747802734, "rewards/margins": 1.1075109243392944, "rewards/rejected": -9.499109268188477, "semantic_entropy": 0.003600142430514097, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 42.73415984103176, "learning_rate": 5.116794575904962e-07, "logits/chosen": 0.6817172765731812, "logits/rejected": 0.7639212608337402, "logps/chosen": -8.343725204467773, "logps/rejected": -9.283025741577148, "loss": 0.5163, "rewards/accuracies": 0.75, "rewards/chosen": -8.343725204467773, "rewards/margins": 0.939300537109375, "rewards/rejected": -9.283025741577148, "semantic_entropy": 0.004117668606340885, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 13.671217913035836, "learning_rate": 5.101224257348987e-07, "logits/chosen": 0.6588679552078247, "logits/rejected": 0.7599143385887146, "logps/chosen": -8.530394554138184, "logps/rejected": -9.683368682861328, "loss": 0.4385, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.530394554138184, "rewards/margins": 1.1529743671417236, "rewards/rejected": -9.683368682861328, "semantic_entropy": 0.003528149798512459, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 17.926663757907765, "learning_rate": 5.085652956714823e-07, "logits/chosen": 0.6311002373695374, "logits/rejected": 0.7291450500488281, "logps/chosen": -8.84853744506836, "logps/rejected": -9.728075981140137, "loss": 0.5199, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.84853744506836, "rewards/margins": 0.8795391917228699, "rewards/rejected": -9.728075981140137, "semantic_entropy": 0.0028904026839882135, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 17.340322531594975, "learning_rate": 5.070080825075298e-07, "logits/chosen": 0.7018830180168152, "logits/rejected": 0.8387192487716675, "logps/chosen": -8.556188583374023, "logps/rejected": -9.5839204788208, "loss": 0.5109, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.556188583374023, "rewards/margins": 1.0277318954467773, "rewards/rejected": -9.5839204788208, "semantic_entropy": 0.004027285613119602, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 15.73972180449081, "learning_rate": 5.0545080135113e-07, "logits/chosen": 0.6703477501869202, "logits/rejected": 0.7229039669036865, "logps/chosen": -8.628973007202148, "logps/rejected": -9.581435203552246, "loss": 0.5456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.628973007202148, "rewards/margins": 0.9524634480476379, "rewards/rejected": -9.581435203552246, "semantic_entropy": 0.003708144649863243, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 23.480077098467, "learning_rate": 5.038934673110316e-07, "logits/chosen": 0.6456252336502075, "logits/rejected": 0.7398085594177246, "logps/chosen": -8.677080154418945, "logps/rejected": -9.685178756713867, "loss": 0.5225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.677080154418945, "rewards/margins": 1.0080986022949219, "rewards/rejected": -9.685178756713867, "semantic_entropy": 0.0029380209743976593, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 15.433407858304248, "learning_rate": 5.023360954964963e-07, "logits/chosen": 0.6237664222717285, "logits/rejected": 0.6907469630241394, "logps/chosen": -8.563250541687012, "logps/rejected": -9.583332061767578, "loss": 0.4477, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -8.563250541687012, "rewards/margins": 1.0200810432434082, "rewards/rejected": -9.583332061767578, "semantic_entropy": 0.0032678351271897554, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 15.623034117961172, "learning_rate": 5.007787010171524e-07, "logits/chosen": 0.5593664050102234, "logits/rejected": 0.6882971525192261, "logps/chosen": -8.571015357971191, "logps/rejected": -9.626784324645996, "loss": 0.4314, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -8.571015357971191, "rewards/margins": 1.055769681930542, "rewards/rejected": -9.626784324645996, "semantic_entropy": 0.003013583132997155, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 19.95888030099492, "learning_rate": 4.992212989828477e-07, "logits/chosen": 0.6781376004219055, "logits/rejected": 0.7037637233734131, "logps/chosen": -8.676929473876953, "logps/rejected": -9.475358009338379, "loss": 0.523, "rewards/accuracies": 0.75, "rewards/chosen": -8.676929473876953, "rewards/margins": 0.7984285354614258, "rewards/rejected": -9.475358009338379, "semantic_entropy": 0.003011090215295553, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 23.790232395129802, "learning_rate": 4.976639045035036e-07, "logits/chosen": 0.6791437268257141, "logits/rejected": 0.7153705358505249, "logps/chosen": -8.594088554382324, "logps/rejected": -9.416778564453125, "loss": 0.5839, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.594088554382324, "rewards/margins": 0.8226897120475769, "rewards/rejected": -9.416778564453125, "semantic_entropy": 0.0035625225864350796, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 19.641758817894523, "learning_rate": 4.961065326889683e-07, "logits/chosen": 0.6901504397392273, "logits/rejected": 0.7671633958816528, "logps/chosen": -8.59467601776123, "logps/rejected": -9.425036430358887, "loss": 0.52, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.59467601776123, "rewards/margins": 0.8303607702255249, "rewards/rejected": -9.425036430358887, "semantic_entropy": 0.003136052517220378, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 21.418887952150815, "learning_rate": 4.9454919864887e-07, "logits/chosen": 0.6037132143974304, "logits/rejected": 0.7032198905944824, "logps/chosen": -8.467050552368164, "logps/rejected": -9.3862886428833, "loss": 0.52, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.467050552368164, "rewards/margins": 0.9192383885383606, "rewards/rejected": -9.3862886428833, "semantic_entropy": 0.0032492957543581724, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 23.776890869160376, "learning_rate": 4.929919174924701e-07, "logits/chosen": 0.6591798663139343, "logits/rejected": 0.7749283909797668, "logps/chosen": -8.419390678405762, "logps/rejected": -9.24329948425293, "loss": 0.5334, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.419390678405762, "rewards/margins": 0.8239078521728516, "rewards/rejected": -9.24329948425293, "semantic_entropy": 0.003245703876018524, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 14.314831569908868, "learning_rate": 4.914347043285177e-07, "logits/chosen": 0.695237398147583, "logits/rejected": 0.7862176299095154, "logps/chosen": -8.383251190185547, "logps/rejected": -9.337328910827637, "loss": 0.474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.383251190185547, "rewards/margins": 0.954079270362854, "rewards/rejected": -9.337328910827637, "semantic_entropy": 0.0036495565436780453, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 17.93218545649754, "learning_rate": 4.898775742651013e-07, "logits/chosen": 0.6778665781021118, "logits/rejected": 0.7646596431732178, "logps/chosen": -8.393171310424805, "logps/rejected": -9.438430786132812, "loss": 0.4243, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -8.393171310424805, "rewards/margins": 1.0452605485916138, "rewards/rejected": -9.438430786132812, "semantic_entropy": 0.0038520165253430605, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 11.631955236946794, "learning_rate": 4.883205424095037e-07, "logits/chosen": 0.6586011648178101, "logits/rejected": 0.7384502291679382, "logps/chosen": -8.368680000305176, "logps/rejected": -9.391559600830078, "loss": 0.4636, "rewards/accuracies": 0.78125, "rewards/chosen": -8.368680000305176, "rewards/margins": 1.0228804349899292, "rewards/rejected": -9.391559600830078, "semantic_entropy": 0.0038888491690158844, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 17.817638170727182, "learning_rate": 4.86763623868055e-07, "logits/chosen": 0.7192140817642212, "logits/rejected": 0.7774937152862549, "logps/chosen": -8.442755699157715, "logps/rejected": -9.343069076538086, "loss": 0.5072, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.442755699157715, "rewards/margins": 0.9003141522407532, "rewards/rejected": -9.343069076538086, "semantic_entropy": 0.0035094446502625942, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 15.880665450221413, "learning_rate": 4.852068337459856e-07, "logits/chosen": 0.7191354036331177, "logits/rejected": 0.788988471031189, "logps/chosen": -8.542181015014648, "logps/rejected": -9.512245178222656, "loss": 0.4687, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.542181015014648, "rewards/margins": 0.9700649380683899, "rewards/rejected": -9.512245178222656, "semantic_entropy": 0.0031049910467118025, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 20.022219719640713, "learning_rate": 4.8365018714728e-07, "logits/chosen": 0.798575222492218, "logits/rejected": 0.8416634798049927, "logps/chosen": -8.635334968566895, "logps/rejected": -9.463285446166992, "loss": 0.5202, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.635334968566895, "rewards/margins": 0.8279510736465454, "rewards/rejected": -9.463285446166992, "semantic_entropy": 0.0029447092674672604, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 22.26373435182509, "learning_rate": 4.820936991745304e-07, "logits/chosen": 0.6276187896728516, "logits/rejected": 0.6913308501243591, "logps/chosen": -8.587217330932617, "logps/rejected": -9.444761276245117, "loss": 0.5068, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.587217330932617, "rewards/margins": 0.8575426936149597, "rewards/rejected": -9.444761276245117, "semantic_entropy": 0.003062673145905137, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 26.04794148992061, "learning_rate": 4.8053738492879e-07, "logits/chosen": 0.6948825120925903, "logits/rejected": 0.7602331042289734, "logps/chosen": -8.406000137329102, "logps/rejected": -9.530774116516113, "loss": 0.4554, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.406000137329102, "rewards/margins": 1.1247742176055908, "rewards/rejected": -9.530774116516113, "semantic_entropy": 0.0036121797747910023, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 23.24113283268114, "learning_rate": 4.789812595094265e-07, "logits/chosen": 0.6636757254600525, "logits/rejected": 0.7241615653038025, "logps/chosen": -8.501133918762207, "logps/rejected": -9.556479454040527, "loss": 0.4467, "rewards/accuracies": 0.78125, "rewards/chosen": -8.501133918762207, "rewards/margins": 1.0553454160690308, "rewards/rejected": -9.556479454040527, "semantic_entropy": 0.00418940931558609, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 17.478492942232236, "learning_rate": 4.774253380139752e-07, "logits/chosen": 0.6438261270523071, "logits/rejected": 0.7361315488815308, "logps/chosen": -8.412601470947266, "logps/rejected": -9.485505104064941, "loss": 0.4467, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.412601470947266, "rewards/margins": 1.0729031562805176, "rewards/rejected": -9.485505104064941, "semantic_entropy": 0.0037474199198186398, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 19.822571481610865, "learning_rate": 4.758696355379936e-07, "logits/chosen": 0.7401809692382812, "logits/rejected": 0.7346007227897644, "logps/chosen": -8.39743423461914, "logps/rejected": -9.354679107666016, "loss": 0.4803, "rewards/accuracies": 0.78125, "rewards/chosen": -8.39743423461914, "rewards/margins": 0.9572445154190063, "rewards/rejected": -9.354679107666016, "semantic_entropy": 0.004037821665406227, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 18.508878104944426, "learning_rate": 4.743141671749138e-07, "logits/chosen": 0.6463350057601929, "logits/rejected": 0.7294069528579712, "logps/chosen": -8.58276081085205, "logps/rejected": -9.354612350463867, "loss": 0.5592, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.58276081085205, "rewards/margins": 0.7718508243560791, "rewards/rejected": -9.354612350463867, "semantic_entropy": 0.0035912543535232544, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 19.828630412175407, "learning_rate": 4.727589480158968e-07, "logits/chosen": 0.6823207139968872, "logits/rejected": 0.7240070104598999, "logps/chosen": -8.653319358825684, "logps/rejected": -9.661191940307617, "loss": 0.4801, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.653319358825684, "rewards/margins": 1.0078718662261963, "rewards/rejected": -9.661191940307617, "semantic_entropy": 0.0033484199084341526, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 20.43246886248836, "learning_rate": 4.712039931496855e-07, "logits/chosen": 0.6765194535255432, "logits/rejected": 0.7426118850708008, "logps/chosen": -8.664289474487305, "logps/rejected": -9.393746376037598, "loss": 0.5722, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.664289474487305, "rewards/margins": 0.7294565439224243, "rewards/rejected": -9.393746376037598, "semantic_entropy": 0.003087881486862898, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 21.003656385946787, "learning_rate": 4.6964931766245905e-07, "logits/chosen": 0.7278314828872681, "logits/rejected": 0.7725498080253601, "logps/chosen": -8.796308517456055, "logps/rejected": -9.755891799926758, "loss": 0.4998, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.796308517456055, "rewards/margins": 0.9595831036567688, "rewards/rejected": -9.755891799926758, "semantic_entropy": 0.002779710106551647, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 21.709608881311866, "learning_rate": 4.6809493663768575e-07, "logits/chosen": 0.6481348276138306, "logits/rejected": 0.6695024967193604, "logps/chosen": -8.799505233764648, "logps/rejected": -9.460579872131348, "loss": 0.5856, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -8.799505233764648, "rewards/margins": 0.6610761880874634, "rewards/rejected": -9.460579872131348, "semantic_entropy": 0.0028663822449743748, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 16.96070709578509, "learning_rate": 4.6654086515597716e-07, "logits/chosen": 0.59629225730896, "logits/rejected": 0.6733515858650208, "logps/chosen": -8.772279739379883, "logps/rejected": -9.883055686950684, "loss": 0.4559, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -8.772279739379883, "rewards/margins": 1.1107757091522217, "rewards/rejected": -9.883055686950684, "semantic_entropy": 0.0028946802485734224, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 15.2495463629594, "learning_rate": 4.6498711829494154e-07, "logits/chosen": 0.6147344708442688, "logits/rejected": 0.6999740600585938, "logps/chosen": -8.856141090393066, "logps/rejected": -9.772600173950195, "loss": 0.5042, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.856141090393066, "rewards/margins": 0.916458010673523, "rewards/rejected": -9.772600173950195, "semantic_entropy": 0.002791165839880705, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 17.706370672383937, "learning_rate": 4.6343371112903777e-07, "logits/chosen": 0.7594738006591797, "logits/rejected": 0.8430054783821106, "logps/chosen": -8.928936958312988, "logps/rejected": -9.845270156860352, "loss": 0.5524, "rewards/accuracies": 0.65625, "rewards/chosen": -8.928936958312988, "rewards/margins": 0.916333019733429, "rewards/rejected": -9.845270156860352, "semantic_entropy": 0.002889876952394843, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.8485396504402161, "eval_logits/rejected": 0.9051938652992249, "eval_logps/chosen": -8.875685691833496, "eval_logps/rejected": -9.834607124328613, "eval_loss": 0.5206592679023743, "eval_rewards/accuracies": 0.716617226600647, "eval_rewards/chosen": -8.875685691833496, "eval_rewards/margins": 0.9589214324951172, "eval_rewards/rejected": -9.834607124328613, "eval_runtime": 35.3345, "eval_samples_per_second": 38.065, "eval_semantic_entropy": 0.0029725246131420135, "eval_steps_per_second": 9.537, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 16.489166146612195, "learning_rate": 4.618806587294291e-07, "logits/chosen": 0.6345168948173523, "logits/rejected": 0.724500834941864, "logps/chosen": -8.844565391540527, "logps/rejected": -9.868181228637695, "loss": 0.491, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.844565391540527, "rewards/margins": 1.023616075515747, "rewards/rejected": -9.868181228637695, "semantic_entropy": 0.0029267659410834312, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 20.284904352984434, "learning_rate": 4.603279761638365e-07, "logits/chosen": 0.6574803590774536, "logits/rejected": 0.7301944494247437, "logps/chosen": -8.73315143585205, "logps/rejected": -9.601076126098633, "loss": 0.5384, "rewards/accuracies": 0.71875, "rewards/chosen": -8.73315143585205, "rewards/margins": 0.8679240942001343, "rewards/rejected": -9.601076126098633, "semantic_entropy": 0.003197681624442339, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 18.178656885884827, "learning_rate": 4.5877567849639315e-07, "logits/chosen": 0.7295519709587097, "logits/rejected": 0.775715708732605, "logps/chosen": -8.844693183898926, "logps/rejected": -9.844103813171387, "loss": 0.4747, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.844693183898926, "rewards/margins": 0.9994112253189087, "rewards/rejected": -9.844103813171387, "semantic_entropy": 0.003269757376983762, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 18.18536875280265, "learning_rate": 4.572237807874979e-07, "logits/chosen": 0.7071816325187683, "logits/rejected": 0.8377809524536133, "logps/chosen": -9.233766555786133, "logps/rejected": -10.121223449707031, "loss": 0.5734, "rewards/accuracies": 0.6875, "rewards/chosen": -9.233766555786133, "rewards/margins": 0.887457549571991, "rewards/rejected": -10.121223449707031, "semantic_entropy": 0.0021587200462818146, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 19.728824808262466, "learning_rate": 4.5567229809366895e-07, "logits/chosen": 0.7191265225410461, "logits/rejected": 0.7812397480010986, "logps/chosen": -8.780452728271484, "logps/rejected": -9.705193519592285, "loss": 0.5172, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.780452728271484, "rewards/margins": 0.9247404932975769, "rewards/rejected": -9.705193519592285, "semantic_entropy": 0.0030050217173993587, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 22.902320646291393, "learning_rate": 4.541212454673984e-07, "logits/chosen": 0.7195814847946167, "logits/rejected": 0.7792760133743286, "logps/chosen": -9.003668785095215, "logps/rejected": -10.163053512573242, "loss": 0.4755, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.003668785095215, "rewards/margins": 1.159385085105896, "rewards/rejected": -10.163053512573242, "semantic_entropy": 0.0027733384631574154, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 21.27644304238722, "learning_rate": 4.525706379570055e-07, "logits/chosen": 0.754095196723938, "logits/rejected": 0.8056744337081909, "logps/chosen": -8.933822631835938, "logps/rejected": -9.91698169708252, "loss": 0.5001, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.933822631835938, "rewards/margins": 0.983159065246582, "rewards/rejected": -9.91698169708252, "semantic_entropy": 0.002852677833288908, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 16.305218184451377, "learning_rate": 4.510204906064911e-07, "logits/chosen": 0.7781286239624023, "logits/rejected": 0.8381627798080444, "logps/chosen": -9.009490013122559, "logps/rejected": -10.12246036529541, "loss": 0.4383, "rewards/accuracies": 0.8125, "rewards/chosen": -9.009490013122559, "rewards/margins": 1.1129701137542725, "rewards/rejected": -10.12246036529541, "semantic_entropy": 0.0021688812412321568, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 21.167999522279032, "learning_rate": 4.4947081845539177e-07, "logits/chosen": 0.7233031988143921, "logits/rejected": 0.7836328148841858, "logps/chosen": -9.125692367553711, "logps/rejected": -10.023492813110352, "loss": 0.5164, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -9.125692367553711, "rewards/margins": 0.8977994918823242, "rewards/rejected": -10.023492813110352, "semantic_entropy": 0.0024925144389271736, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 17.847444177521478, "learning_rate": 4.479216365386333e-07, "logits/chosen": 0.7969452142715454, "logits/rejected": 0.8838433027267456, "logps/chosen": -9.0504789352417, "logps/rejected": -10.168962478637695, "loss": 0.445, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.0504789352417, "rewards/margins": 1.1184842586517334, "rewards/rejected": -10.168962478637695, "semantic_entropy": 0.00228295405395329, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 13.863230971283963, "learning_rate": 4.4637295988638555e-07, "logits/chosen": 0.7870410680770874, "logits/rejected": 0.8611448407173157, "logps/chosen": -8.87813663482666, "logps/rejected": -9.888373374938965, "loss": 0.4735, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.87813663482666, "rewards/margins": 1.0102384090423584, "rewards/rejected": -9.888373374938965, "semantic_entropy": 0.00256515690125525, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 23.662117895070335, "learning_rate": 4.4482480352391623e-07, "logits/chosen": 0.6543598175048828, "logits/rejected": 0.7426427006721497, "logps/chosen": -9.041504859924316, "logps/rejected": -10.04463005065918, "loss": 0.4806, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -9.041504859924316, "rewards/margins": 1.0031250715255737, "rewards/rejected": -10.04463005065918, "semantic_entropy": 0.0023853727616369724, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 24.256153218206705, "learning_rate": 4.4327718247144507e-07, "logits/chosen": 0.7513245940208435, "logits/rejected": 0.8328276872634888, "logps/chosen": -9.090994834899902, "logps/rejected": -10.088811874389648, "loss": 0.4761, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.090994834899902, "rewards/margins": 0.9978184700012207, "rewards/rejected": -10.088811874389648, "semantic_entropy": 0.002217040164396167, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 26.314796306829628, "learning_rate": 4.417301117439984e-07, "logits/chosen": 0.7460024356842041, "logits/rejected": 0.8103092312812805, "logps/chosen": -9.169168472290039, "logps/rejected": -10.078702926635742, "loss": 0.5253, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -9.169168472290039, "rewards/margins": 0.9095350503921509, "rewards/rejected": -10.078702926635742, "semantic_entropy": 0.0022672966588288546, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 18.707979583798863, "learning_rate": 4.401836063512631e-07, "logits/chosen": 0.7222810983657837, "logits/rejected": 0.8605899810791016, "logps/chosen": -8.943084716796875, "logps/rejected": -10.04680347442627, "loss": 0.4723, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.943084716796875, "rewards/margins": 1.1037187576293945, "rewards/rejected": -10.04680347442627, "semantic_entropy": 0.002709039021283388, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 24.168153990326203, "learning_rate": 4.386376812974413e-07, "logits/chosen": 0.6883140802383423, "logits/rejected": 0.7442909479141235, "logps/chosen": -8.88626766204834, "logps/rejected": -9.887288093566895, "loss": 0.4843, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -8.88626766204834, "rewards/margins": 1.0010201930999756, "rewards/rejected": -9.887288093566895, "semantic_entropy": 0.002547713927924633, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 21.306584019538054, "learning_rate": 4.370923515811048e-07, "logits/chosen": 0.7414734363555908, "logits/rejected": 0.8382769823074341, "logps/chosen": -9.08434009552002, "logps/rejected": -10.091033935546875, "loss": 0.4818, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.08434009552002, "rewards/margins": 1.0066949129104614, "rewards/rejected": -10.091033935546875, "semantic_entropy": 0.002196715446189046, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 17.567070937529156, "learning_rate": 4.35547632195049e-07, "logits/chosen": 0.7264934778213501, "logits/rejected": 0.8058233261108398, "logps/chosen": -8.905702590942383, "logps/rejected": -9.89527416229248, "loss": 0.456, "rewards/accuracies": 0.78125, "rewards/chosen": -8.905702590942383, "rewards/margins": 0.9895727038383484, "rewards/rejected": -9.89527416229248, "semantic_entropy": 0.0023129256442189217, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 21.091679239749574, "learning_rate": 4.340035381261484e-07, "logits/chosen": 0.7000614404678345, "logits/rejected": 0.7599838972091675, "logps/chosen": -9.041738510131836, "logps/rejected": -10.077522277832031, "loss": 0.4989, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.041738510131836, "rewards/margins": 1.0357847213745117, "rewards/rejected": -10.077522277832031, "semantic_entropy": 0.002556400140747428, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 19.4750798931357, "learning_rate": 4.324600843552104e-07, "logits/chosen": 0.61224764585495, "logits/rejected": 0.694617509841919, "logps/chosen": -9.045601844787598, "logps/rejected": -10.111780166625977, "loss": 0.5129, "rewards/accuracies": 0.75, "rewards/chosen": -9.045601844787598, "rewards/margins": 1.0661789178848267, "rewards/rejected": -10.111780166625977, "semantic_entropy": 0.002750884275883436, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 27.700841135900863, "learning_rate": 4.309172858568302e-07, "logits/chosen": 0.6138121485710144, "logits/rejected": 0.7267636060714722, "logps/chosen": -8.853018760681152, "logps/rejected": -9.896512985229492, "loss": 0.4664, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.853018760681152, "rewards/margins": 1.0434927940368652, "rewards/rejected": -9.896512985229492, "semantic_entropy": 0.002849545329809189, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 17.137464902365757, "learning_rate": 4.293751575992455e-07, "logits/chosen": 0.7429224848747253, "logits/rejected": 0.792006254196167, "logps/chosen": -8.867963790893555, "logps/rejected": -9.816696166992188, "loss": 0.4852, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.867963790893555, "rewards/margins": 0.9487320184707642, "rewards/rejected": -9.816696166992188, "semantic_entropy": 0.0024321440141648054, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 23.093031626174174, "learning_rate": 4.278337145441916e-07, "logits/chosen": 0.703718900680542, "logits/rejected": 0.7882632613182068, "logps/chosen": -8.92485237121582, "logps/rejected": -9.829282760620117, "loss": 0.4997, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.92485237121582, "rewards/margins": 0.904431939125061, "rewards/rejected": -9.829282760620117, "semantic_entropy": 0.00211041746661067, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 14.81174004133826, "learning_rate": 4.262929716467556e-07, "logits/chosen": 0.7307204008102417, "logits/rejected": 0.828132152557373, "logps/chosen": -8.699949264526367, "logps/rejected": -9.868879318237305, "loss": 0.4528, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.699949264526367, "rewards/margins": 1.1689304113388062, "rewards/rejected": -9.868879318237305, "semantic_entropy": 0.0027522039599716663, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 21.68742926157539, "learning_rate": 4.247529438552321e-07, "logits/chosen": 0.6795674562454224, "logits/rejected": 0.7630687355995178, "logps/chosen": -8.83338451385498, "logps/rejected": -9.718835830688477, "loss": 0.5331, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.83338451385498, "rewards/margins": 0.8854507207870483, "rewards/rejected": -9.718835830688477, "semantic_entropy": 0.0027334585320204496, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 17.78803459405486, "learning_rate": 4.232136461109773e-07, "logits/chosen": 0.6920473575592041, "logits/rejected": 0.7552872896194458, "logps/chosen": -8.73144245147705, "logps/rejected": -9.89186954498291, "loss": 0.4425, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.73144245147705, "rewards/margins": 1.1604268550872803, "rewards/rejected": -9.89186954498291, "semantic_entropy": 0.0029787137173116207, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 26.15974232065996, "learning_rate": 4.216750933482646e-07, "logits/chosen": 0.6749182939529419, "logits/rejected": 0.7685472965240479, "logps/chosen": -8.99049186706543, "logps/rejected": -9.847522735595703, "loss": 0.5483, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.99049186706543, "rewards/margins": 0.8570305705070496, "rewards/rejected": -9.847522735595703, "semantic_entropy": 0.002466335194185376, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 27.88902855871644, "learning_rate": 4.2013730049413986e-07, "logits/chosen": 0.7373770475387573, "logits/rejected": 0.8123876452445984, "logps/chosen": -8.785151481628418, "logps/rejected": -9.975650787353516, "loss": 0.4473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.785151481628418, "rewards/margins": 1.1904983520507812, "rewards/rejected": -9.975650787353516, "semantic_entropy": 0.0027192619163542986, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 18.711785224251184, "learning_rate": 4.1860028246827594e-07, "logits/chosen": 0.7438098788261414, "logits/rejected": 0.8496102094650269, "logps/chosen": -8.687559127807617, "logps/rejected": -9.705583572387695, "loss": 0.4863, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.687559127807617, "rewards/margins": 1.0180258750915527, "rewards/rejected": -9.705583572387695, "semantic_entropy": 0.0030276733450591564, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 14.740013723796874, "learning_rate": 4.170640541828285e-07, "logits/chosen": 0.6757484674453735, "logits/rejected": 0.7701447606086731, "logps/chosen": -8.937789916992188, "logps/rejected": -9.954288482666016, "loss": 0.4742, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -8.937789916992188, "rewards/margins": 1.0164979696273804, "rewards/rejected": -9.954288482666016, "semantic_entropy": 0.0024529777001589537, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 23.48077638997515, "learning_rate": 4.1552863054229116e-07, "logits/chosen": 0.7250600457191467, "logits/rejected": 0.7637051343917847, "logps/chosen": -8.986780166625977, "logps/rejected": -9.95530891418457, "loss": 0.5204, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.986780166625977, "rewards/margins": 0.968528151512146, "rewards/rejected": -9.95530891418457, "semantic_entropy": 0.0026002321392297745, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 21.480655584717656, "learning_rate": 4.139940264433508e-07, "logits/chosen": 0.6162451505661011, "logits/rejected": 0.6867518424987793, "logps/chosen": -8.727703094482422, "logps/rejected": -9.817054748535156, "loss": 0.485, "rewards/accuracies": 0.71875, "rewards/chosen": -8.727703094482422, "rewards/margins": 1.0893512964248657, "rewards/rejected": -9.817054748535156, "semantic_entropy": 0.0029442054219543934, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 18.329533144366998, "learning_rate": 4.1246025677474303e-07, "logits/chosen": 0.6584054231643677, "logits/rejected": 0.7416545152664185, "logps/chosen": -8.870896339416504, "logps/rejected": -9.782048225402832, "loss": 0.4965, "rewards/accuracies": 0.8125, "rewards/chosen": -8.870896339416504, "rewards/margins": 0.9111523628234863, "rewards/rejected": -9.782048225402832, "semantic_entropy": 0.0025828261859714985, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 20.53246303343791, "learning_rate": 4.10927336417108e-07, "logits/chosen": 0.699885368347168, "logits/rejected": 0.7778645753860474, "logps/chosen": -9.00536060333252, "logps/rejected": -9.712576866149902, "loss": 0.6029, "rewards/accuracies": 0.71875, "rewards/chosen": -9.00536060333252, "rewards/margins": 0.7072166204452515, "rewards/rejected": -9.712576866149902, "semantic_entropy": 0.0022258516401052475, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 17.92382157104711, "learning_rate": 4.093952802428457e-07, "logits/chosen": 0.7124849557876587, "logits/rejected": 0.773395836353302, "logps/chosen": -9.165143013000488, "logps/rejected": -10.00512409210205, "loss": 0.5968, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -9.165143013000488, "rewards/margins": 0.8399818539619446, "rewards/rejected": -10.00512409210205, "semantic_entropy": 0.0018791807815432549, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 16.031873167807106, "learning_rate": 4.0786410311597184e-07, "logits/chosen": 0.6675196886062622, "logits/rejected": 0.7558413743972778, "logps/chosen": -8.87452220916748, "logps/rejected": -9.87469482421875, "loss": 0.5014, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.87452220916748, "rewards/margins": 1.0001723766326904, "rewards/rejected": -9.87469482421875, "semantic_entropy": 0.0024310979060828686, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 17.18330270756532, "learning_rate": 4.063338198919737e-07, "logits/chosen": 0.6833704710006714, "logits/rejected": 0.6954981684684753, "logps/chosen": -8.974740982055664, "logps/rejected": -9.834104537963867, "loss": 0.515, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.974740982055664, "rewards/margins": 0.8593646883964539, "rewards/rejected": -9.834104537963867, "semantic_entropy": 0.002467888640239835, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 30.761929745103753, "learning_rate": 4.0480444541766575e-07, "logits/chosen": 0.7065908908843994, "logits/rejected": 0.761638343334198, "logps/chosen": -9.091318130493164, "logps/rejected": -9.837724685668945, "loss": 0.5966, "rewards/accuracies": 0.6875, "rewards/chosen": -9.091318130493164, "rewards/margins": 0.7464063763618469, "rewards/rejected": -9.837724685668945, "semantic_entropy": 0.002267292933538556, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 17.437367219946186, "learning_rate": 4.0327599453104606e-07, "logits/chosen": 0.6314225792884827, "logits/rejected": 0.7021452784538269, "logps/chosen": -8.934675216674805, "logps/rejected": -9.954093933105469, "loss": 0.4614, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.934675216674805, "rewards/margins": 1.019417405128479, "rewards/rejected": -9.954093933105469, "semantic_entropy": 0.002328323433175683, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 23.666717824573407, "learning_rate": 4.017484820611514e-07, "logits/chosen": 0.6827374696731567, "logits/rejected": 0.7666773796081543, "logps/chosen": -9.024335861206055, "logps/rejected": -9.977819442749023, "loss": 0.499, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.024335861206055, "rewards/margins": 0.9534839391708374, "rewards/rejected": -9.977819442749023, "semantic_entropy": 0.002623113337904215, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 19.47814600029652, "learning_rate": 4.002219228279148e-07, "logits/chosen": 0.6472792029380798, "logits/rejected": 0.7226368188858032, "logps/chosen": -9.10517406463623, "logps/rejected": -9.983365058898926, "loss": 0.4792, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -9.10517406463623, "rewards/margins": 0.878190815448761, "rewards/rejected": -9.983365058898926, "semantic_entropy": 0.001885834732092917, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 15.863322204582039, "learning_rate": 3.9869633164202045e-07, "logits/chosen": 0.6499922871589661, "logits/rejected": 0.7298166751861572, "logps/chosen": -9.109966278076172, "logps/rejected": -10.095043182373047, "loss": 0.4607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.109966278076172, "rewards/margins": 0.9850764274597168, "rewards/rejected": -10.095043182373047, "semantic_entropy": 0.00205561937764287, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 20.471803353130138, "learning_rate": 3.9717172330476077e-07, "logits/chosen": 0.6554244756698608, "logits/rejected": 0.7187341451644897, "logps/chosen": -8.939603805541992, "logps/rejected": -9.954214096069336, "loss": 0.4732, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.939603805541992, "rewards/margins": 1.0146093368530273, "rewards/rejected": -9.954214096069336, "semantic_entropy": 0.002255493775010109, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 17.77008986004475, "learning_rate": 3.956481126078927e-07, "logits/chosen": 0.7610489726066589, "logits/rejected": 0.8220788836479187, "logps/chosen": -8.944317817687988, "logps/rejected": -9.942276000976562, "loss": 0.5332, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.944317817687988, "rewards/margins": 0.9979581832885742, "rewards/rejected": -9.942276000976562, "semantic_entropy": 0.0026616621762514114, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 17.655564242212915, "learning_rate": 3.941255143334937e-07, "logits/chosen": 0.6452963948249817, "logits/rejected": 0.6714794039726257, "logps/chosen": -9.091032028198242, "logps/rejected": -10.047563552856445, "loss": 0.4901, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -9.091032028198242, "rewards/margins": 0.9565309286117554, "rewards/rejected": -10.047563552856445, "semantic_entropy": 0.0020148297771811485, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 21.687629209942305, "learning_rate": 3.9260394325381895e-07, "logits/chosen": 0.6120941638946533, "logits/rejected": 0.6991773843765259, "logps/chosen": -8.957076072692871, "logps/rejected": -10.24323844909668, "loss": 0.4464, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -8.957076072692871, "rewards/margins": 1.286162257194519, "rewards/rejected": -10.24323844909668, "semantic_entropy": 0.002235526219010353, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 20.343261070644665, "learning_rate": 3.9108341413115784e-07, "logits/chosen": 0.6617427468299866, "logits/rejected": 0.7312533259391785, "logps/chosen": -9.039822578430176, "logps/rejected": -10.031728744506836, "loss": 0.4638, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.039822578430176, "rewards/margins": 0.9919074177742004, "rewards/rejected": -10.031728744506836, "semantic_entropy": 0.002043725224211812, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 23.236450227403648, "learning_rate": 3.895639417176905e-07, "logits/chosen": 0.6440222859382629, "logits/rejected": 0.7037031054496765, "logps/chosen": -9.063264846801758, "logps/rejected": -10.010323524475098, "loss": 0.5479, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -9.063264846801758, "rewards/margins": 0.9470599889755249, "rewards/rejected": -10.010323524475098, "semantic_entropy": 0.0022424368653446436, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 19.62330937214821, "learning_rate": 3.8804554075534497e-07, "logits/chosen": 0.6262876987457275, "logits/rejected": 0.7453981041908264, "logps/chosen": -8.971453666687012, "logps/rejected": -10.038634300231934, "loss": 0.5028, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -8.971453666687012, "rewards/margins": 1.067180871963501, "rewards/rejected": -10.038634300231934, "semantic_entropy": 0.002401644829660654, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 20.685899363455015, "learning_rate": 3.8652822597565403e-07, "logits/chosen": 0.6409908533096313, "logits/rejected": 0.7288905382156372, "logps/chosen": -9.046746253967285, "logps/rejected": -10.134294509887695, "loss": 0.4441, "rewards/accuracies": 0.78125, "rewards/chosen": -9.046746253967285, "rewards/margins": 1.0875482559204102, "rewards/rejected": -10.134294509887695, "semantic_entropy": 0.0022094310261309147, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 21.553543305017367, "learning_rate": 3.850120120996123e-07, "logits/chosen": 0.6723691821098328, "logits/rejected": 0.797301173210144, "logps/chosen": -9.249448776245117, "logps/rejected": -10.260098457336426, "loss": 0.525, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -9.249448776245117, "rewards/margins": 1.0106487274169922, "rewards/rejected": -10.260098457336426, "semantic_entropy": 0.0019731963984668255, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 18.840448032000573, "learning_rate": 3.8349691383753356e-07, "logits/chosen": 0.7298885583877563, "logits/rejected": 0.7910041213035583, "logps/chosen": -8.926676750183105, "logps/rejected": -9.932170867919922, "loss": 0.4829, "rewards/accuracies": 0.75, "rewards/chosen": -8.926676750183105, "rewards/margins": 1.0054935216903687, "rewards/rejected": -9.932170867919922, "semantic_entropy": 0.0026156664825975895, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 22.35639759207699, "learning_rate": 3.819829458889078e-07, "logits/chosen": 0.6428291201591492, "logits/rejected": 0.6888445615768433, "logps/chosen": -9.04511833190918, "logps/rejected": -10.008955001831055, "loss": 0.5023, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -9.04511833190918, "rewards/margins": 0.9638371467590332, "rewards/rejected": -10.008955001831055, "semantic_entropy": 0.002010942902415991, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 18.756301563038658, "learning_rate": 3.804701229422585e-07, "logits/chosen": 0.6267444491386414, "logits/rejected": 0.7092264890670776, "logps/chosen": -8.965785026550293, "logps/rejected": -10.16661548614502, "loss": 0.4493, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.965785026550293, "rewards/margins": 1.2008302211761475, "rewards/rejected": -10.16661548614502, "semantic_entropy": 0.002212436404079199, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 20.509740408539155, "learning_rate": 3.789584596750007e-07, "logits/chosen": 0.644954264163971, "logits/rejected": 0.6691696047782898, "logps/chosen": -9.087681770324707, "logps/rejected": -10.077864646911621, "loss": 0.5039, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -9.087681770324707, "rewards/margins": 0.9901838302612305, "rewards/rejected": -10.077864646911621, "semantic_entropy": 0.0026786925736814737, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 21.56561893288092, "learning_rate": 3.77447970753298e-07, "logits/chosen": 0.6968456506729126, "logits/rejected": 0.7207155823707581, "logps/chosen": -9.34924602508545, "logps/rejected": -10.384513854980469, "loss": 0.5044, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.34924602508545, "rewards/margins": 1.0352654457092285, "rewards/rejected": -10.384513854980469, "semantic_entropy": 0.001824896433390677, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 27.331504916633676, "learning_rate": 3.7593867083192057e-07, "logits/chosen": 0.6225256323814392, "logits/rejected": 0.7147785425186157, "logps/chosen": -9.113728523254395, "logps/rejected": -10.127340316772461, "loss": 0.4956, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -9.113728523254395, "rewards/margins": 1.0136115550994873, "rewards/rejected": -10.127340316772461, "semantic_entropy": 0.002069010864943266, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 24.045050734996725, "learning_rate": 3.7443057455410276e-07, "logits/chosen": 0.7259531617164612, "logits/rejected": 0.767625093460083, "logps/chosen": -9.031217575073242, "logps/rejected": -10.065279006958008, "loss": 0.453, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.031217575073242, "rewards/margins": 1.034061312675476, "rewards/rejected": -10.065279006958008, "semantic_entropy": 0.002121392637491226, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 22.0898928139177, "learning_rate": 3.7292369655140145e-07, "logits/chosen": 0.6421200037002563, "logits/rejected": 0.7500838041305542, "logps/chosen": -9.124689102172852, "logps/rejected": -10.039737701416016, "loss": 0.4787, "rewards/accuracies": 0.78125, "rewards/chosen": -9.124689102172852, "rewards/margins": 0.915047824382782, "rewards/rejected": -10.039737701416016, "semantic_entropy": 0.0020720604807138443, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 17.368463245273844, "learning_rate": 3.714180514435534e-07, "logits/chosen": 0.6820253133773804, "logits/rejected": 0.767578125, "logps/chosen": -8.85074520111084, "logps/rejected": -9.993677139282227, "loss": 0.4724, "rewards/accuracies": 0.78125, "rewards/chosen": -8.85074520111084, "rewards/margins": 1.1429319381713867, "rewards/rejected": -9.993677139282227, "semantic_entropy": 0.0029044263064861298, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 26.111048656274622, "learning_rate": 3.6991365383833426e-07, "logits/chosen": 0.6787170767784119, "logits/rejected": 0.7552027702331543, "logps/chosen": -8.890897750854492, "logps/rejected": -9.884078025817871, "loss": 0.476, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -8.890897750854492, "rewards/margins": 0.9931808710098267, "rewards/rejected": -9.884078025817871, "semantic_entropy": 0.0023928822483867407, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 25.720344446595462, "learning_rate": 3.684105183314162e-07, "logits/chosen": 0.6534699201583862, "logits/rejected": 0.7134217619895935, "logps/chosen": -8.654134750366211, "logps/rejected": -9.688114166259766, "loss": 0.4572, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.654134750366211, "rewards/margins": 1.0339783430099487, "rewards/rejected": -9.688114166259766, "semantic_entropy": 0.0033304274547845125, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 26.719905564320833, "learning_rate": 3.669086595062263e-07, "logits/chosen": 0.6928398609161377, "logits/rejected": 0.7959692478179932, "logps/chosen": -8.998773574829102, "logps/rejected": -10.005119323730469, "loss": 0.4712, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.998773574829102, "rewards/margins": 1.0063453912734985, "rewards/rejected": -10.005119323730469, "semantic_entropy": 0.002207712968811393, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 18.310966271031408, "learning_rate": 3.654080919338056e-07, "logits/chosen": 0.6792198419570923, "logits/rejected": 0.7416011691093445, "logps/chosen": -9.011211395263672, "logps/rejected": -9.959385871887207, "loss": 0.5049, "rewards/accuracies": 0.71875, "rewards/chosen": -9.011211395263672, "rewards/margins": 0.9481745958328247, "rewards/rejected": -9.959385871887207, "semantic_entropy": 0.0023789291735738516, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 23.54294511956383, "learning_rate": 3.639088301726673e-07, "logits/chosen": 0.7045190334320068, "logits/rejected": 0.814649224281311, "logps/chosen": -9.096199035644531, "logps/rejected": -9.993253707885742, "loss": 0.5336, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.096199035644531, "rewards/margins": 0.8970546722412109, "rewards/rejected": -9.993253707885742, "semantic_entropy": 0.0019266394665464759, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 25.00575901129148, "learning_rate": 3.624108887686556e-07, "logits/chosen": 0.717838704586029, "logits/rejected": 0.7664039134979248, "logps/chosen": -9.02385139465332, "logps/rejected": -9.899523735046387, "loss": 0.4944, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.02385139465332, "rewards/margins": 0.8756723403930664, "rewards/rejected": -9.899523735046387, "semantic_entropy": 0.00239885738119483, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 14.206266277834583, "learning_rate": 3.6091428225480433e-07, "logits/chosen": 0.6777101755142212, "logits/rejected": 0.7591882944107056, "logps/chosen": -8.996365547180176, "logps/rejected": -10.051039695739746, "loss": 0.4775, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.996365547180176, "rewards/margins": 1.0546749830245972, "rewards/rejected": -10.051039695739746, "semantic_entropy": 0.0023488677106797695, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 24.582046979640456, "learning_rate": 3.5941902515119674e-07, "logits/chosen": 0.6657333374023438, "logits/rejected": 0.7678895592689514, "logps/chosen": -9.090019226074219, "logps/rejected": -9.867273330688477, "loss": 0.5413, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.090019226074219, "rewards/margins": 0.7772535085678101, "rewards/rejected": -9.867273330688477, "semantic_entropy": 0.0021225649397820234, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 22.002900126766963, "learning_rate": 3.5792513196482373e-07, "logits/chosen": 0.6315397620201111, "logits/rejected": 0.7467874884605408, "logps/chosen": -8.85982894897461, "logps/rejected": -9.882316589355469, "loss": 0.4531, "rewards/accuracies": 0.8125, "rewards/chosen": -8.85982894897461, "rewards/margins": 1.0224884748458862, "rewards/rejected": -9.882316589355469, "semantic_entropy": 0.0025396724231541157, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 18.88844359088285, "learning_rate": 3.5643261718944346e-07, "logits/chosen": 0.7221022844314575, "logits/rejected": 0.779187798500061, "logps/chosen": -9.089310646057129, "logps/rejected": -9.891887664794922, "loss": 0.5786, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -9.089310646057129, "rewards/margins": 0.802577018737793, "rewards/rejected": -9.891887664794922, "semantic_entropy": 0.002080023754388094, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 14.948819514875911, "learning_rate": 3.5494149530544087e-07, "logits/chosen": 0.6752597093582153, "logits/rejected": 0.7371557354927063, "logps/chosen": -8.859451293945312, "logps/rejected": -9.911179542541504, "loss": 0.483, "rewards/accuracies": 0.75, "rewards/chosen": -8.859451293945312, "rewards/margins": 1.051727294921875, "rewards/rejected": -9.911179542541504, "semantic_entropy": 0.0026133800856769085, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 24.637227593137656, "learning_rate": 3.534517807796871e-07, "logits/chosen": 0.6935003995895386, "logits/rejected": 0.7413294315338135, "logps/chosen": -8.910869598388672, "logps/rejected": -9.782427787780762, "loss": 0.5241, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.910869598388672, "rewards/margins": 0.8715595006942749, "rewards/rejected": -9.782427787780762, "semantic_entropy": 0.002564162714406848, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 16.09008068793547, "learning_rate": 3.519634880653988e-07, "logits/chosen": 0.7049506902694702, "logits/rejected": 0.7636333703994751, "logps/chosen": -9.050897598266602, "logps/rejected": -10.201239585876465, "loss": 0.4495, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.050897598266602, "rewards/margins": 1.150342583656311, "rewards/rejected": -10.201239585876465, "semantic_entropy": 0.0020480218809098005, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 17.29460063325407, "learning_rate": 3.504766316019987e-07, "logits/chosen": 0.6761201024055481, "logits/rejected": 0.7888168096542358, "logps/chosen": -8.835293769836426, "logps/rejected": -9.917633056640625, "loss": 0.454, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -8.835293769836426, "rewards/margins": 1.0823395252227783, "rewards/rejected": -9.917633056640625, "semantic_entropy": 0.002739850664511323, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 15.951273378502073, "learning_rate": 3.489912258149745e-07, "logits/chosen": 0.7415227293968201, "logits/rejected": 0.8035387992858887, "logps/chosen": -8.881102561950684, "logps/rejected": -9.969578742980957, "loss": 0.4531, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.881102561950684, "rewards/margins": 1.0884764194488525, "rewards/rejected": -9.969578742980957, "semantic_entropy": 0.0023522416595369577, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 18.18795560922933, "learning_rate": 3.475072851157397e-07, "logits/chosen": 0.7050553560256958, "logits/rejected": 0.7514214515686035, "logps/chosen": -8.872556686401367, "logps/rejected": -9.893532752990723, "loss": 0.4645, "rewards/accuracies": 0.78125, "rewards/chosen": -8.872556686401367, "rewards/margins": 1.0209757089614868, "rewards/rejected": -9.893532752990723, "semantic_entropy": 0.0025408435612916946, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 15.738805690279383, "learning_rate": 3.460248239014936e-07, "logits/chosen": 0.7101159691810608, "logits/rejected": 0.7551933526992798, "logps/chosen": -9.055107116699219, "logps/rejected": -10.217333793640137, "loss": 0.4421, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.055107116699219, "rewards/margins": 1.1622273921966553, "rewards/rejected": -10.217333793640137, "semantic_entropy": 0.002511825645342469, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 19.41964183644154, "learning_rate": 3.4454385655508134e-07, "logits/chosen": 0.7462642788887024, "logits/rejected": 0.7571308016777039, "logps/chosen": -9.074853897094727, "logps/rejected": -9.919515609741211, "loss": 0.5562, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -9.074853897094727, "rewards/margins": 0.8446613550186157, "rewards/rejected": -9.919515609741211, "semantic_entropy": 0.002418497810140252, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 15.49567951743004, "learning_rate": 3.4306439744485447e-07, "logits/chosen": 0.6995843052864075, "logits/rejected": 0.7880675792694092, "logps/chosen": -9.252038955688477, "logps/rejected": -10.221213340759277, "loss": 0.5099, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.252038955688477, "rewards/margins": 0.969176173210144, "rewards/rejected": -10.221213340759277, "semantic_entropy": 0.0019108497072011232, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 21.391951740269207, "learning_rate": 3.415864609245322e-07, "logits/chosen": 0.7241548895835876, "logits/rejected": 0.7919793725013733, "logps/chosen": -9.234020233154297, "logps/rejected": -10.174389839172363, "loss": 0.5311, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.234020233154297, "rewards/margins": 0.9403679966926575, "rewards/rejected": -10.174389839172363, "semantic_entropy": 0.0019412841647863388, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.7883932590484619, "eval_logits/rejected": 0.8341716527938843, "eval_logps/chosen": -9.098273277282715, "eval_logps/rejected": -10.07473087310791, "eval_loss": 0.5169808268547058, "eval_rewards/accuracies": 0.7232937812805176, "eval_rewards/chosen": -9.098273277282715, "eval_rewards/margins": 0.9764575362205505, "eval_rewards/rejected": -10.07473087310791, "eval_runtime": 35.2413, "eval_samples_per_second": 38.165, "eval_semantic_entropy": 0.0023804251104593277, "eval_steps_per_second": 9.563, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 20.553397352111812, "learning_rate": 3.401100613330605e-07, "logits/chosen": 0.7208374738693237, "logits/rejected": 0.7372707724571228, "logps/chosen": -8.946023941040039, "logps/rejected": -9.88565731048584, "loss": 0.5125, "rewards/accuracies": 0.71875, "rewards/chosen": -8.946023941040039, "rewards/margins": 0.9396332502365112, "rewards/rejected": -9.88565731048584, "semantic_entropy": 0.0025065175723284483, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 14.899765335539588, "learning_rate": 3.3863521299447514e-07, "logits/chosen": 0.66487056016922, "logits/rejected": 0.7429142594337463, "logps/chosen": -8.882848739624023, "logps/rejected": -9.959342956542969, "loss": 0.4247, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -8.882848739624023, "rewards/margins": 1.0764933824539185, "rewards/rejected": -9.959342956542969, "semantic_entropy": 0.0027935917023569345, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 18.125423863526265, "learning_rate": 3.371619302177609e-07, "logits/chosen": 0.7205886840820312, "logits/rejected": 0.783849835395813, "logps/chosen": -9.081689834594727, "logps/rejected": -10.110027313232422, "loss": 0.493, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.081689834594727, "rewards/margins": 1.0283381938934326, "rewards/rejected": -10.110027313232422, "semantic_entropy": 0.0018970107194036245, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 22.214798729957145, "learning_rate": 3.3569022729671393e-07, "logits/chosen": 0.7102506160736084, "logits/rejected": 0.7653765678405762, "logps/chosen": -9.172645568847656, "logps/rejected": -10.060796737670898, "loss": 0.5162, "rewards/accuracies": 0.71875, "rewards/chosen": -9.172645568847656, "rewards/margins": 0.8881510496139526, "rewards/rejected": -10.060796737670898, "semantic_entropy": 0.002231413032859564, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 16.808694866109835, "learning_rate": 3.342201185098024e-07, "logits/chosen": 0.7016305923461914, "logits/rejected": 0.7090336084365845, "logps/chosen": -8.874353408813477, "logps/rejected": -9.903203010559082, "loss": 0.4595, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.874353408813477, "rewards/margins": 1.028850793838501, "rewards/rejected": -9.903203010559082, "semantic_entropy": 0.0032983936835080385, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 19.764545534044544, "learning_rate": 3.3275161812002807e-07, "logits/chosen": 0.6539800763130188, "logits/rejected": 0.6890634894371033, "logps/chosen": -8.991025924682617, "logps/rejected": -10.090238571166992, "loss": 0.4918, "rewards/accuracies": 0.75, "rewards/chosen": -8.991025924682617, "rewards/margins": 1.0992109775543213, "rewards/rejected": -10.090238571166992, "semantic_entropy": 0.002287736628204584, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 22.455179400576377, "learning_rate": 3.312847403747883e-07, "logits/chosen": 0.6598862409591675, "logits/rejected": 0.7323023676872253, "logps/chosen": -8.853995323181152, "logps/rejected": -10.00835132598877, "loss": 0.4441, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -8.853995323181152, "rewards/margins": 1.1543556451797485, "rewards/rejected": -10.00835132598877, "semantic_entropy": 0.0028118849731981754, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 20.333660341959458, "learning_rate": 3.2981949950573733e-07, "logits/chosen": 0.6398320198059082, "logits/rejected": 0.6992667317390442, "logps/chosen": -9.079477310180664, "logps/rejected": -10.012245178222656, "loss": 0.4811, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.079477310180664, "rewards/margins": 0.9327686429023743, "rewards/rejected": -10.012245178222656, "semantic_entropy": 0.002612376119941473, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 17.83037025874042, "learning_rate": 3.283559097286486e-07, "logits/chosen": 0.6089428663253784, "logits/rejected": 0.6763657331466675, "logps/chosen": -9.060758590698242, "logps/rejected": -9.848979949951172, "loss": 0.5207, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.060758590698242, "rewards/margins": 0.7882214188575745, "rewards/rejected": -9.848979949951172, "semantic_entropy": 0.002590155927464366, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 18.640006592880273, "learning_rate": 3.268939852432765e-07, "logits/chosen": 0.6610291600227356, "logits/rejected": 0.6989740133285522, "logps/chosen": -9.217550277709961, "logps/rejected": -9.963292121887207, "loss": 0.5338, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.217550277709961, "rewards/margins": 0.7457407712936401, "rewards/rejected": -9.963292121887207, "semantic_entropy": 0.002440792042762041, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 23.51490179151936, "learning_rate": 3.254337402332187e-07, "logits/chosen": 0.7316364049911499, "logits/rejected": 0.7886163592338562, "logps/chosen": -9.14958381652832, "logps/rejected": -10.033146858215332, "loss": 0.5228, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.14958381652832, "rewards/margins": 0.8835636377334595, "rewards/rejected": -10.033146858215332, "semantic_entropy": 0.0022072389256209135, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 21.709681200914343, "learning_rate": 3.239751888657788e-07, "logits/chosen": 0.7047960162162781, "logits/rejected": 0.7564027309417725, "logps/chosen": -9.247949600219727, "logps/rejected": -10.113713264465332, "loss": 0.5189, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.247949600219727, "rewards/margins": 0.8657627105712891, "rewards/rejected": -10.113713264465332, "semantic_entropy": 0.002100490964949131, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 20.917519916910795, "learning_rate": 3.2251834529182856e-07, "logits/chosen": 0.6687744855880737, "logits/rejected": 0.7209922671318054, "logps/chosen": -8.913464546203613, "logps/rejected": -10.03473949432373, "loss": 0.4826, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.913464546203613, "rewards/margins": 1.1212753057479858, "rewards/rejected": -10.03473949432373, "semantic_entropy": 0.0026697556022554636, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 24.85961354517434, "learning_rate": 3.2106322364567075e-07, "logits/chosen": 0.7192034721374512, "logits/rejected": 0.776824951171875, "logps/chosen": -8.97942066192627, "logps/rejected": -10.088353157043457, "loss": 0.4491, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.97942066192627, "rewards/margins": 1.1089332103729248, "rewards/rejected": -10.088353157043457, "semantic_entropy": 0.002531954552978277, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 18.73523000620898, "learning_rate": 3.1960983804490183e-07, "logits/chosen": 0.6787633895874023, "logits/rejected": 0.7581242322921753, "logps/chosen": -9.208142280578613, "logps/rejected": -10.221251487731934, "loss": 0.5371, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.208142280578613, "rewards/margins": 1.0131086111068726, "rewards/rejected": -10.221251487731934, "semantic_entropy": 0.0021549214143306017, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 16.839870791359772, "learning_rate": 3.1815820259027537e-07, "logits/chosen": 0.6967512369155884, "logits/rejected": 0.7626298666000366, "logps/chosen": -8.988113403320312, "logps/rejected": -10.06352710723877, "loss": 0.439, "rewards/accuracies": 0.78125, "rewards/chosen": -8.988113403320312, "rewards/margins": 1.0754133462905884, "rewards/rejected": -10.06352710723877, "semantic_entropy": 0.002239447785541415, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 25.890088694071558, "learning_rate": 3.16708331365565e-07, "logits/chosen": 0.7034773826599121, "logits/rejected": 0.7456248998641968, "logps/chosen": -9.344053268432617, "logps/rejected": -10.413859367370605, "loss": 0.4768, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.344053268432617, "rewards/margins": 1.0698063373565674, "rewards/rejected": -10.413859367370605, "semantic_entropy": 0.0019446806982159615, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 20.622779505918793, "learning_rate": 3.152602384374275e-07, "logits/chosen": 0.77290278673172, "logits/rejected": 0.8412041664123535, "logps/chosen": -9.321812629699707, "logps/rejected": -10.298714637756348, "loss": 0.4964, "rewards/accuracies": 0.75, "rewards/chosen": -9.321812629699707, "rewards/margins": 0.9769018292427063, "rewards/rejected": -10.298714637756348, "semantic_entropy": 0.0020173420198261738, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 20.085568144464975, "learning_rate": 3.1381393785526697e-07, "logits/chosen": 0.7355653643608093, "logits/rejected": 0.7890772819519043, "logps/chosen": -9.242449760437012, "logps/rejected": -10.24592399597168, "loss": 0.476, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.242449760437012, "rewards/margins": 1.0034732818603516, "rewards/rejected": -10.24592399597168, "semantic_entropy": 0.0019115330651402473, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 19.738688834879806, "learning_rate": 3.123694436510979e-07, "logits/chosen": 0.7584089040756226, "logits/rejected": 0.8401368260383606, "logps/chosen": -9.147109985351562, "logps/rejected": -10.11845874786377, "loss": 0.4871, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -9.147109985351562, "rewards/margins": 0.9713494181632996, "rewards/rejected": -10.11845874786377, "semantic_entropy": 0.002204468008130789, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 23.899312142624215, "learning_rate": 3.1092676983940946e-07, "logits/chosen": 0.8023883700370789, "logits/rejected": 0.8330841064453125, "logps/chosen": -9.159601211547852, "logps/rejected": -10.233617782592773, "loss": 0.4659, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.159601211547852, "rewards/margins": 1.0740149021148682, "rewards/rejected": -10.233617782592773, "semantic_entropy": 0.0021205353550612926, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 19.69531748298203, "learning_rate": 3.094859304170293e-07, "logits/chosen": 0.8703521490097046, "logits/rejected": 0.9046932458877563, "logps/chosen": -9.088752746582031, "logps/rejected": -10.052389144897461, "loss": 0.514, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.088752746582031, "rewards/margins": 0.9636358022689819, "rewards/rejected": -10.052389144897461, "semantic_entropy": 0.0021650404669344425, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 16.58014663580926, "learning_rate": 3.0804693936298795e-07, "logits/chosen": 0.8061652183532715, "logits/rejected": 0.8440017700195312, "logps/chosen": -9.173129081726074, "logps/rejected": -10.378541946411133, "loss": 0.4549, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.173129081726074, "rewards/margins": 1.2054128646850586, "rewards/rejected": -10.378541946411133, "semantic_entropy": 0.002410900080576539, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 19.2115007374387, "learning_rate": 3.066098106383826e-07, "logits/chosen": 0.7924807071685791, "logits/rejected": 0.8539689183235168, "logps/chosen": -9.085798263549805, "logps/rejected": -10.019353866577148, "loss": 0.4867, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.085798263549805, "rewards/margins": 0.9335559010505676, "rewards/rejected": -10.019353866577148, "semantic_entropy": 0.0022169214207679033, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 15.73085596537741, "learning_rate": 3.0517455818624263e-07, "logits/chosen": 0.728915810585022, "logits/rejected": 0.7807096838951111, "logps/chosen": -9.136758804321289, "logps/rejected": -10.273028373718262, "loss": 0.4235, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.136758804321289, "rewards/margins": 1.1362701654434204, "rewards/rejected": -10.273028373718262, "semantic_entropy": 0.0019926291424781084, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 21.48912266471965, "learning_rate": 3.037411959313936e-07, "logits/chosen": 0.8049052357673645, "logits/rejected": 0.8560088872909546, "logps/chosen": -9.200352668762207, "logps/rejected": -10.1701078414917, "loss": 0.4887, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.200352668762207, "rewards/margins": 0.9697545766830444, "rewards/rejected": -10.1701078414917, "semantic_entropy": 0.0019432473927736282, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 29.255894045857996, "learning_rate": 3.023097377803224e-07, "logits/chosen": 0.8145462870597839, "logits/rejected": 0.8601492047309875, "logps/chosen": -9.245951652526855, "logps/rejected": -10.164546012878418, "loss": 0.5493, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -9.245951652526855, "rewards/margins": 0.9185951352119446, "rewards/rejected": -10.164546012878418, "semantic_entropy": 0.0018242119112983346, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 20.524507990126196, "learning_rate": 3.008801976210423e-07, "logits/chosen": 0.8181111216545105, "logits/rejected": 0.8561455607414246, "logps/chosen": -9.23983383178711, "logps/rejected": -10.13349723815918, "loss": 0.4809, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -9.23983383178711, "rewards/margins": 0.8936625719070435, "rewards/rejected": -10.13349723815918, "semantic_entropy": 0.002180408453568816, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 17.0373176165906, "learning_rate": 2.994525893229581e-07, "logits/chosen": 0.8141145706176758, "logits/rejected": 0.8517176508903503, "logps/chosen": -9.19636344909668, "logps/rejected": -10.442273139953613, "loss": 0.3877, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.19636344909668, "rewards/margins": 1.2459100484848022, "rewards/rejected": -10.442273139953613, "semantic_entropy": 0.002024973975494504, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 15.697982397516139, "learning_rate": 2.98026926736732e-07, "logits/chosen": 0.7669566869735718, "logits/rejected": 0.811779797077179, "logps/chosen": -8.987443923950195, "logps/rejected": -10.217406272888184, "loss": 0.4174, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -8.987443923950195, "rewards/margins": 1.2299631834030151, "rewards/rejected": -10.217406272888184, "semantic_entropy": 0.0029276900459080935, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 14.632163572843812, "learning_rate": 2.9660322369414846e-07, "logits/chosen": 0.8088932037353516, "logits/rejected": 0.8847886323928833, "logps/chosen": -9.229738235473633, "logps/rejected": -10.462305068969727, "loss": 0.4008, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.229738235473633, "rewards/margins": 1.2325657606124878, "rewards/rejected": -10.462305068969727, "semantic_entropy": 0.0019744504243135452, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 13.938257971384212, "learning_rate": 2.9518149400798063e-07, "logits/chosen": 0.7997492551803589, "logits/rejected": 0.8458935022354126, "logps/chosen": -9.3661470413208, "logps/rejected": -10.73242473602295, "loss": 0.4011, "rewards/accuracies": 0.8125, "rewards/chosen": -9.3661470413208, "rewards/margins": 1.3662781715393066, "rewards/rejected": -10.73242473602295, "semantic_entropy": 0.0021405040752142668, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 21.92333227216617, "learning_rate": 2.9376175147185633e-07, "logits/chosen": 0.7903780937194824, "logits/rejected": 0.8987275958061218, "logps/chosen": -9.55534553527832, "logps/rejected": -10.687799453735352, "loss": 0.4631, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.55534553527832, "rewards/margins": 1.132454514503479, "rewards/rejected": -10.687799453735352, "semantic_entropy": 0.0018273256719112396, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 21.21705630664952, "learning_rate": 2.9234400986012376e-07, "logits/chosen": 0.7476860284805298, "logits/rejected": 0.8315266370773315, "logps/chosen": -9.206350326538086, "logps/rejected": -10.599584579467773, "loss": 0.3865, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.206350326538086, "rewards/margins": 1.3932336568832397, "rewards/rejected": -10.599584579467773, "semantic_entropy": 0.002457220805808902, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 23.717702676896593, "learning_rate": 2.9092828292771817e-07, "logits/chosen": 0.8404645919799805, "logits/rejected": 0.8653911352157593, "logps/chosen": -9.488239288330078, "logps/rejected": -10.67754077911377, "loss": 0.4216, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.488239288330078, "rewards/margins": 1.1893017292022705, "rewards/rejected": -10.67754077911377, "semantic_entropy": 0.001736976788379252, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 18.026440727909602, "learning_rate": 2.8951458441002875e-07, "logits/chosen": 0.7588644027709961, "logits/rejected": 0.804205596446991, "logps/chosen": -9.2258882522583, "logps/rejected": -10.435041427612305, "loss": 0.4317, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.2258882522583, "rewards/margins": 1.2091554403305054, "rewards/rejected": -10.435041427612305, "semantic_entropy": 0.0019264190923422575, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 17.35712096542329, "learning_rate": 2.881029280227643e-07, "logits/chosen": 0.7276099324226379, "logits/rejected": 0.8241073489189148, "logps/chosen": -9.210673332214355, "logps/rejected": -10.420351028442383, "loss": 0.443, "rewards/accuracies": 0.78125, "rewards/chosen": -9.210673332214355, "rewards/margins": 1.2096776962280273, "rewards/rejected": -10.420351028442383, "semantic_entropy": 0.0021508794743567705, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 13.520740192725968, "learning_rate": 2.8669332746182177e-07, "logits/chosen": 0.6945358514785767, "logits/rejected": 0.7733569741249084, "logps/chosen": -9.187652587890625, "logps/rejected": -10.517842292785645, "loss": 0.3952, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.187652587890625, "rewards/margins": 1.3301887512207031, "rewards/rejected": -10.517842292785645, "semantic_entropy": 0.0021430773194879293, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 20.76831890859162, "learning_rate": 2.8528579640315156e-07, "logits/chosen": 0.7405019998550415, "logits/rejected": 0.7900283336639404, "logps/chosen": -9.107033729553223, "logps/rejected": -10.247450828552246, "loss": 0.4329, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.107033729553223, "rewards/margins": 1.140415906906128, "rewards/rejected": -10.247450828552246, "semantic_entropy": 0.0022071374114602804, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 20.516974992601128, "learning_rate": 2.8388034850262646e-07, "logits/chosen": 0.7259657382965088, "logits/rejected": 0.807550311088562, "logps/chosen": -9.008337020874023, "logps/rejected": -10.242483139038086, "loss": 0.4152, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.008337020874023, "rewards/margins": 1.2341454029083252, "rewards/rejected": -10.242483139038086, "semantic_entropy": 0.0025359108112752438, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 24.572088168328634, "learning_rate": 2.824769973959079e-07, "logits/chosen": 0.7538091540336609, "logits/rejected": 0.8418930172920227, "logps/chosen": -9.286577224731445, "logps/rejected": -10.427810668945312, "loss": 0.4257, "rewards/accuracies": 0.8125, "rewards/chosen": -9.286577224731445, "rewards/margins": 1.1412330865859985, "rewards/rejected": -10.427810668945312, "semantic_entropy": 0.0017364490777254105, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 18.5404432038831, "learning_rate": 2.81075756698315e-07, "logits/chosen": 0.7726496458053589, "logits/rejected": 0.855174720287323, "logps/chosen": -9.180562019348145, "logps/rejected": -10.476685523986816, "loss": 0.3802, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.180562019348145, "rewards/margins": 1.29612398147583, "rewards/rejected": -10.476685523986816, "semantic_entropy": 0.0018982533365488052, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 18.915627570533157, "learning_rate": 2.7967664000469035e-07, "logits/chosen": 0.721420407295227, "logits/rejected": 0.7838973999023438, "logps/chosen": -9.233001708984375, "logps/rejected": -10.553727149963379, "loss": 0.3627, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.233001708984375, "rewards/margins": 1.320725679397583, "rewards/rejected": -10.553727149963379, "semantic_entropy": 0.0020444700494408607, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 18.14358020288826, "learning_rate": 2.7827966088927095e-07, "logits/chosen": 0.6938169598579407, "logits/rejected": 0.8017823100090027, "logps/chosen": -9.465526580810547, "logps/rejected": -10.78095817565918, "loss": 0.3999, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.465526580810547, "rewards/margins": 1.315431833267212, "rewards/rejected": -10.78095817565918, "semantic_entropy": 0.001522608334198594, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 17.57535140312262, "learning_rate": 2.768848329055538e-07, "logits/chosen": 0.7879313230514526, "logits/rejected": 0.8248831629753113, "logps/chosen": -9.305051803588867, "logps/rejected": -10.57546615600586, "loss": 0.3852, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.305051803588867, "rewards/margins": 1.2704143524169922, "rewards/rejected": -10.57546615600586, "semantic_entropy": 0.0018570246174931526, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 20.119447880874766, "learning_rate": 2.7549216958616657e-07, "logits/chosen": 0.7350586652755737, "logits/rejected": 0.8105939030647278, "logps/chosen": -9.496885299682617, "logps/rejected": -10.835896492004395, "loss": 0.3968, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.496885299682617, "rewards/margins": 1.33901047706604, "rewards/rejected": -10.835896492004395, "semantic_entropy": 0.0016353337559849024, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 15.071984684362736, "learning_rate": 2.741016844427344e-07, "logits/chosen": 0.7667199969291687, "logits/rejected": 0.8489478826522827, "logps/chosen": -9.403945922851562, "logps/rejected": -10.778000831604004, "loss": 0.3713, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -9.403945922851562, "rewards/margins": 1.3740556240081787, "rewards/rejected": -10.778000831604004, "semantic_entropy": 0.001894004992209375, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 17.801746580163428, "learning_rate": 2.7271339096575073e-07, "logits/chosen": 0.7659896612167358, "logits/rejected": 0.8438106775283813, "logps/chosen": -9.306072235107422, "logps/rejected": -10.509511947631836, "loss": 0.4364, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.306072235107422, "rewards/margins": 1.2034391164779663, "rewards/rejected": -10.509511947631836, "semantic_entropy": 0.00206328509375453, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 16.02571773917183, "learning_rate": 2.713273026244446e-07, "logits/chosen": 0.7731425166130066, "logits/rejected": 0.860715389251709, "logps/chosen": -9.523012161254883, "logps/rejected": -10.883203506469727, "loss": 0.3804, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -9.523012161254883, "rewards/margins": 1.3601921796798706, "rewards/rejected": -10.883203506469727, "semantic_entropy": 0.001517820986919105, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 17.59552798096768, "learning_rate": 2.6994343286665156e-07, "logits/chosen": 0.7494341731071472, "logits/rejected": 0.8325299024581909, "logps/chosen": -9.523847579956055, "logps/rejected": -10.638973236083984, "loss": 0.4481, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.523847579956055, "rewards/margins": 1.1151244640350342, "rewards/rejected": -10.638973236083984, "semantic_entropy": 0.0015663004014641047, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 21.242971901873577, "learning_rate": 2.6856179511868156e-07, "logits/chosen": 0.7531827092170715, "logits/rejected": 0.8257854580879211, "logps/chosen": -9.41790771484375, "logps/rejected": -10.846906661987305, "loss": 0.428, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.41790771484375, "rewards/margins": 1.428999423980713, "rewards/rejected": -10.846906661987305, "semantic_entropy": 0.001963115995749831, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 23.548898424244427, "learning_rate": 2.6718240278519056e-07, "logits/chosen": 0.7559301853179932, "logits/rejected": 0.8003666996955872, "logps/chosen": -9.45875072479248, "logps/rejected": -10.853038787841797, "loss": 0.4169, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.45875072479248, "rewards/margins": 1.3942878246307373, "rewards/rejected": -10.853038787841797, "semantic_entropy": 0.0015923971077427268, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 20.46304047002647, "learning_rate": 2.6580526924904866e-07, "logits/chosen": 0.6976224780082703, "logits/rejected": 0.7671376466751099, "logps/chosen": -9.384844779968262, "logps/rejected": -10.741189956665039, "loss": 0.3793, "rewards/accuracies": 0.84375, "rewards/chosen": -9.384844779968262, "rewards/margins": 1.3563454151153564, "rewards/rejected": -10.741189956665039, "semantic_entropy": 0.001644113683141768, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 24.88011754633229, "learning_rate": 2.6443040787121186e-07, "logits/chosen": 0.6563600301742554, "logits/rejected": 0.6912602782249451, "logps/chosen": -9.349275588989258, "logps/rejected": -10.556272506713867, "loss": 0.4195, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.349275588989258, "rewards/margins": 1.206997036933899, "rewards/rejected": -10.556272506713867, "semantic_entropy": 0.0016969643766060472, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 24.129234255509385, "learning_rate": 2.6305783199059084e-07, "logits/chosen": 0.7805946469306946, "logits/rejected": 0.8489789962768555, "logps/chosen": -9.523481369018555, "logps/rejected": -10.783955574035645, "loss": 0.4536, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.523481369018555, "rewards/margins": 1.2604728937149048, "rewards/rejected": -10.783955574035645, "semantic_entropy": 0.001705177710391581, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 20.770855194305607, "learning_rate": 2.6168755492392324e-07, "logits/chosen": 0.7947415113449097, "logits/rejected": 0.8732272982597351, "logps/chosen": -9.23397159576416, "logps/rejected": -10.698512077331543, "loss": 0.3445, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -9.23397159576416, "rewards/margins": 1.4645414352416992, "rewards/rejected": -10.698512077331543, "semantic_entropy": 0.0018425941234454513, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 19.657773873554152, "learning_rate": 2.6031958996564274e-07, "logits/chosen": 0.7870718240737915, "logits/rejected": 0.8257455825805664, "logps/chosen": -9.316213607788086, "logps/rejected": -10.864585876464844, "loss": 0.3707, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.316213607788086, "rewards/margins": 1.5483721494674683, "rewards/rejected": -10.864585876464844, "semantic_entropy": 0.001938262372277677, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 29.398133134663638, "learning_rate": 2.589539503877518e-07, "logits/chosen": 0.7874363660812378, "logits/rejected": 0.8331443667411804, "logps/chosen": -9.478517532348633, "logps/rejected": -10.828798294067383, "loss": 0.4304, "rewards/accuracies": 0.8125, "rewards/chosen": -9.478517532348633, "rewards/margins": 1.3502806425094604, "rewards/rejected": -10.828798294067383, "semantic_entropy": 0.0018109595403075218, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 17.54106321474661, "learning_rate": 2.5759064943969125e-07, "logits/chosen": 0.7402059435844421, "logits/rejected": 0.8292325735092163, "logps/chosen": -9.550467491149902, "logps/rejected": -10.937161445617676, "loss": 0.3934, "rewards/accuracies": 0.8125, "rewards/chosen": -9.550467491149902, "rewards/margins": 1.3866939544677734, "rewards/rejected": -10.937161445617676, "semantic_entropy": 0.0017161194700747728, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 17.162272687965423, "learning_rate": 2.562297003482131e-07, "logits/chosen": 0.800572395324707, "logits/rejected": 0.8477448225021362, "logps/chosen": -9.5453462600708, "logps/rejected": -10.9487943649292, "loss": 0.3582, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.5453462600708, "rewards/margins": 1.4034483432769775, "rewards/rejected": -10.9487943649292, "semantic_entropy": 0.0017864892724901438, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 18.16669999802219, "learning_rate": 2.548711163172512e-07, "logits/chosen": 0.7785830497741699, "logits/rejected": 0.8476356267929077, "logps/chosen": -9.759795188903809, "logps/rejected": -11.034205436706543, "loss": 0.4168, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.759795188903809, "rewards/margins": 1.2744102478027344, "rewards/rejected": -11.034205436706543, "semantic_entropy": 0.0020329877734184265, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 21.502409164431704, "learning_rate": 2.53514910527794e-07, "logits/chosen": 0.8269200325012207, "logits/rejected": 0.8657910227775574, "logps/chosen": -9.461301803588867, "logps/rejected": -10.762018203735352, "loss": 0.3961, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.461301803588867, "rewards/margins": 1.300715446472168, "rewards/rejected": -10.762018203735352, "semantic_entropy": 0.0016499152407050133, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 22.127771721745866, "learning_rate": 2.5216109613775573e-07, "logits/chosen": 0.7920838594436646, "logits/rejected": 0.8535796403884888, "logps/chosen": -9.840039253234863, "logps/rejected": -11.020828247070312, "loss": 0.4512, "rewards/accuracies": 0.78125, "rewards/chosen": -9.840039253234863, "rewards/margins": 1.1807891130447388, "rewards/rejected": -11.020828247070312, "semantic_entropy": 0.0013382106553763151, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 21.03906625443135, "learning_rate": 2.5080968628184993e-07, "logits/chosen": 0.7727931141853333, "logits/rejected": 0.8665952682495117, "logps/chosen": -9.525362968444824, "logps/rejected": -11.037253379821777, "loss": 0.3669, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.525362968444824, "rewards/margins": 1.5118907690048218, "rewards/rejected": -11.037253379821777, "semantic_entropy": 0.0015195768792182207, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 17.048956060445978, "learning_rate": 2.494606940714605e-07, "logits/chosen": 0.7970033884048462, "logits/rejected": 0.8288514018058777, "logps/chosen": -9.431905746459961, "logps/rejected": -10.85214900970459, "loss": 0.3827, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.431905746459961, "rewards/margins": 1.4202440977096558, "rewards/rejected": -10.85214900970459, "semantic_entropy": 0.001813689828850329, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 15.3880445844123, "learning_rate": 2.4811413259451625e-07, "logits/chosen": 0.7811511158943176, "logits/rejected": 0.860381007194519, "logps/chosen": -9.466936111450195, "logps/rejected": -10.93088436126709, "loss": 0.376, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.466936111450195, "rewards/margins": 1.4639488458633423, "rewards/rejected": -10.93088436126709, "semantic_entropy": 0.0018378589302301407, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 15.913978912005007, "learning_rate": 2.46770014915362e-07, "logits/chosen": 0.7653626203536987, "logits/rejected": 0.8481870889663696, "logps/chosen": -9.565362930297852, "logps/rejected": -10.939167022705078, "loss": 0.3977, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.565362930297852, "rewards/margins": 1.3738042116165161, "rewards/rejected": -10.939167022705078, "semantic_entropy": 0.0015822149580344558, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 27.05791308561908, "learning_rate": 2.45428354074634e-07, "logits/chosen": 0.7272459268569946, "logits/rejected": 0.768274188041687, "logps/chosen": -9.656865119934082, "logps/rejected": -10.986165046691895, "loss": 0.4415, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.656865119934082, "rewards/margins": 1.3292994499206543, "rewards/rejected": -10.986165046691895, "semantic_entropy": 0.0016760114813223481, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 24.89803710573063, "learning_rate": 2.4408916308913105e-07, "logits/chosen": 0.7583307027816772, "logits/rejected": 0.8247106671333313, "logps/chosen": -9.691811561584473, "logps/rejected": -10.66722297668457, "loss": 0.4955, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.691811561584473, "rewards/margins": 0.9754101634025574, "rewards/rejected": -10.66722297668457, "semantic_entropy": 0.0014091429766267538, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 27.703608412459293, "learning_rate": 2.4275245495169025e-07, "logits/chosen": 0.8197698593139648, "logits/rejected": 0.9059907793998718, "logps/chosen": -9.506436347961426, "logps/rejected": -10.890914916992188, "loss": 0.4066, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.506436347961426, "rewards/margins": 1.3844783306121826, "rewards/rejected": -10.890914916992188, "semantic_entropy": 0.0022954349406063557, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 23.22557014778233, "learning_rate": 2.414182426310597e-07, "logits/chosen": 0.758955180644989, "logits/rejected": 0.8061805963516235, "logps/chosen": -9.516363143920898, "logps/rejected": -10.984567642211914, "loss": 0.4001, "rewards/accuracies": 0.84375, "rewards/chosen": -9.516363143920898, "rewards/margins": 1.4682044982910156, "rewards/rejected": -10.984567642211914, "semantic_entropy": 0.0018032476073130965, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 13.701261098953237, "learning_rate": 2.400865390717734e-07, "logits/chosen": 0.7926728129386902, "logits/rejected": 0.8731076121330261, "logps/chosen": -9.494871139526367, "logps/rejected": -11.167115211486816, "loss": 0.3389, "rewards/accuracies": 0.875, "rewards/chosen": -9.494871139526367, "rewards/margins": 1.6722424030303955, "rewards/rejected": -11.167115211486816, "semantic_entropy": 0.0018036758992820978, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 20.712815017204772, "learning_rate": 2.3875735719402475e-07, "logits/chosen": 0.7888078093528748, "logits/rejected": 0.8680068850517273, "logps/chosen": -9.749283790588379, "logps/rejected": -11.215142250061035, "loss": 0.3831, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.749283790588379, "rewards/margins": 1.4658589363098145, "rewards/rejected": -11.215142250061035, "semantic_entropy": 0.0016434881836175919, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 20.343949879193467, "learning_rate": 2.3743070989354258e-07, "logits/chosen": 0.8662029504776001, "logits/rejected": 0.9218491315841675, "logps/chosen": -9.647726058959961, "logps/rejected": -11.065472602844238, "loss": 0.4375, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.647726058959961, "rewards/margins": 1.4177464246749878, "rewards/rejected": -11.065472602844238, "semantic_entropy": 0.001984253991395235, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 24.494673317414936, "learning_rate": 2.3610661004146454e-07, "logits/chosen": 0.8529459834098816, "logits/rejected": 0.9132159352302551, "logps/chosen": -9.46276569366455, "logps/rejected": -10.809822082519531, "loss": 0.3669, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -9.46276569366455, "rewards/margins": 1.3470571041107178, "rewards/rejected": -10.809822082519531, "semantic_entropy": 0.0018992737168446183, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 19.20248865933138, "learning_rate": 2.3478507048421314e-07, "logits/chosen": 0.8148723840713501, "logits/rejected": 0.8485409617424011, "logps/chosen": -9.510710716247559, "logps/rejected": -11.025351524353027, "loss": 0.3994, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.510710716247559, "rewards/margins": 1.5146404504776, "rewards/rejected": -11.025351524353027, "semantic_entropy": 0.0019136825576424599, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 28.472850279807695, "learning_rate": 2.334661040433713e-07, "logits/chosen": 0.7736892700195312, "logits/rejected": 0.8397472500801086, "logps/chosen": -9.476969718933105, "logps/rejected": -10.861968040466309, "loss": 0.3791, "rewards/accuracies": 0.84375, "rewards/chosen": -9.476969718933105, "rewards/margins": 1.3849985599517822, "rewards/rejected": -10.861968040466309, "semantic_entropy": 0.0017747702077031136, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 19.047738461075404, "learning_rate": 2.321497235155568e-07, "logits/chosen": 0.7408386468887329, "logits/rejected": 0.7888758778572083, "logps/chosen": -9.377801895141602, "logps/rejected": -10.882084846496582, "loss": 0.3476, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -9.377801895141602, "rewards/margins": 1.5042815208435059, "rewards/rejected": -10.882084846496582, "semantic_entropy": 0.00191340665332973, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 28.183809111461034, "learning_rate": 2.3083594167229965e-07, "logits/chosen": 0.748810887336731, "logits/rejected": 0.8732229471206665, "logps/chosen": -9.67158317565918, "logps/rejected": -11.065845489501953, "loss": 0.4378, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.67158317565918, "rewards/margins": 1.3942630290985107, "rewards/rejected": -11.065845489501953, "semantic_entropy": 0.00166232546325773, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 25.264626186096123, "learning_rate": 2.295247712599167e-07, "logits/chosen": 0.806961178779602, "logits/rejected": 0.8473097681999207, "logps/chosen": -9.56501579284668, "logps/rejected": -10.97465705871582, "loss": 0.3953, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.56501579284668, "rewards/margins": 1.4096405506134033, "rewards/rejected": -10.97465705871582, "semantic_entropy": 0.0016302301082760096, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.9285687804222107, "eval_logits/rejected": 0.978207528591156, "eval_logps/chosen": -9.8406982421875, "eval_logps/rejected": -10.940929412841797, "eval_loss": 0.526120126247406, "eval_rewards/accuracies": 0.719584584236145, "eval_rewards/chosen": -9.8406982421875, "eval_rewards/margins": 1.100231647491455, "eval_rewards/rejected": -10.940929412841797, "eval_runtime": 35.0954, "eval_samples_per_second": 38.324, "eval_semantic_entropy": 0.001473370473831892, "eval_steps_per_second": 9.602, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 23.667192452025375, "learning_rate": 2.2821622499938948e-07, "logits/chosen": 0.8243509531021118, "logits/rejected": 0.9055337905883789, "logps/chosen": -9.871681213378906, "logps/rejected": -11.096675872802734, "loss": 0.4597, "rewards/accuracies": 0.78125, "rewards/chosen": -9.871681213378906, "rewards/margins": 1.224994421005249, "rewards/rejected": -11.096675872802734, "semantic_entropy": 0.0016837811563163996, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 27.290675701525487, "learning_rate": 2.269103155862391e-07, "logits/chosen": 0.76947021484375, "logits/rejected": 0.8418231010437012, "logps/chosen": -9.789541244506836, "logps/rejected": -10.954092979431152, "loss": 0.4684, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -9.789541244506836, "rewards/margins": 1.1645511388778687, "rewards/rejected": -10.954092979431152, "semantic_entropy": 0.0013261919375509024, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 22.016730657163862, "learning_rate": 2.2560705569040483e-07, "logits/chosen": 0.7831665873527527, "logits/rejected": 0.8772950172424316, "logps/chosen": -9.797338485717773, "logps/rejected": -11.046935081481934, "loss": 0.4435, "rewards/accuracies": 0.78125, "rewards/chosen": -9.797338485717773, "rewards/margins": 1.249597191810608, "rewards/rejected": -11.046935081481934, "semantic_entropy": 0.0014855300541967154, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 18.024857287329926, "learning_rate": 2.2430645795611963e-07, "logits/chosen": 0.7459646463394165, "logits/rejected": 0.8185451626777649, "logps/chosen": -9.737576484680176, "logps/rejected": -11.164031982421875, "loss": 0.3774, "rewards/accuracies": 0.8125, "rewards/chosen": -9.737576484680176, "rewards/margins": 1.4264552593231201, "rewards/rejected": -11.164031982421875, "semantic_entropy": 0.0017724098870530725, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 26.64615086075229, "learning_rate": 2.230085350017884e-07, "logits/chosen": 0.8524463772773743, "logits/rejected": 0.8867918848991394, "logps/chosen": -9.587358474731445, "logps/rejected": -10.673149108886719, "loss": 0.4612, "rewards/accuracies": 0.78125, "rewards/chosen": -9.587358474731445, "rewards/margins": 1.0857917070388794, "rewards/rejected": -10.673149108886719, "semantic_entropy": 0.0018163727363571525, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 16.629223181883795, "learning_rate": 2.2171329941986554e-07, "logits/chosen": 0.7545696496963501, "logits/rejected": 0.8315639495849609, "logps/chosen": -9.440114974975586, "logps/rejected": -10.97568130493164, "loss": 0.35, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.440114974975586, "rewards/margins": 1.5355665683746338, "rewards/rejected": -10.97568130493164, "semantic_entropy": 0.0017560431733727455, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 16.933778427501522, "learning_rate": 2.2042076377673202e-07, "logits/chosen": 0.7635836005210876, "logits/rejected": 0.7833055853843689, "logps/chosen": -9.408586502075195, "logps/rejected": -10.607271194458008, "loss": 0.4138, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.408586502075195, "rewards/margins": 1.1986857652664185, "rewards/rejected": -10.607271194458008, "semantic_entropy": 0.0017117311945185065, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 23.28336770400718, "learning_rate": 2.1913094061257476e-07, "logits/chosen": 0.8093854784965515, "logits/rejected": 0.8086700439453125, "logps/chosen": -9.568441390991211, "logps/rejected": -10.82945442199707, "loss": 0.4363, "rewards/accuracies": 0.78125, "rewards/chosen": -9.568441390991211, "rewards/margins": 1.2610145807266235, "rewards/rejected": -10.82945442199707, "semantic_entropy": 0.0016031649429351091, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 21.12058624108258, "learning_rate": 2.178438424412633e-07, "logits/chosen": 0.8389409780502319, "logits/rejected": 0.8938524127006531, "logps/chosen": -9.587557792663574, "logps/rejected": -10.771153450012207, "loss": 0.4503, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.587557792663574, "rewards/margins": 1.1835949420928955, "rewards/rejected": -10.771153450012207, "semantic_entropy": 0.0016682265559211373, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 31.758164864561788, "learning_rate": 2.165594817502302e-07, "logits/chosen": 0.8181732892990112, "logits/rejected": 0.8720429539680481, "logps/chosen": -9.709211349487305, "logps/rejected": -10.695291519165039, "loss": 0.5042, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.709211349487305, "rewards/margins": 0.9860790371894836, "rewards/rejected": -10.695291519165039, "semantic_entropy": 0.0015186185482889414, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 22.613433139861957, "learning_rate": 2.1527787100034806e-07, "logits/chosen": 0.8588092923164368, "logits/rejected": 0.8922932744026184, "logps/chosen": -9.441000938415527, "logps/rejected": -10.503214836120605, "loss": 0.4521, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.441000938415527, "rewards/margins": 1.062213659286499, "rewards/rejected": -10.503214836120605, "semantic_entropy": 0.0017032899195328355, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 18.889543716610927, "learning_rate": 2.1399902262581037e-07, "logits/chosen": 0.9263399243354797, "logits/rejected": 0.967937171459198, "logps/chosen": -9.547457695007324, "logps/rejected": -10.7252836227417, "loss": 0.4549, "rewards/accuracies": 0.78125, "rewards/chosen": -9.547457695007324, "rewards/margins": 1.177826166152954, "rewards/rejected": -10.7252836227417, "semantic_entropy": 0.0017254750709980726, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 20.68930586575862, "learning_rate": 2.127229490340094e-07, "logits/chosen": 0.7730456590652466, "logits/rejected": 0.801898181438446, "logps/chosen": -9.515592575073242, "logps/rejected": -10.984514236450195, "loss": 0.3715, "rewards/accuracies": 0.84375, "rewards/chosen": -9.515592575073242, "rewards/margins": 1.4689228534698486, "rewards/rejected": -10.984514236450195, "semantic_entropy": 0.0015030469512566924, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 24.806502277573657, "learning_rate": 2.1144966260541698e-07, "logits/chosen": 0.8492151498794556, "logits/rejected": 0.9006759524345398, "logps/chosen": -9.50521183013916, "logps/rejected": -11.008363723754883, "loss": 0.4001, "rewards/accuracies": 0.8125, "rewards/chosen": -9.50521183013916, "rewards/margins": 1.5031511783599854, "rewards/rejected": -11.008363723754883, "semantic_entropy": 0.0017372198635712266, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 19.324917536101047, "learning_rate": 2.1017917569346332e-07, "logits/chosen": 0.814143180847168, "logits/rejected": 0.8865806460380554, "logps/chosen": -9.452108383178711, "logps/rejected": -10.87096118927002, "loss": 0.3736, "rewards/accuracies": 0.8125, "rewards/chosen": -9.452108383178711, "rewards/margins": 1.4188525676727295, "rewards/rejected": -10.87096118927002, "semantic_entropy": 0.0015151125844568014, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 18.81790999844241, "learning_rate": 2.0891150062441837e-07, "logits/chosen": 0.7656540870666504, "logits/rejected": 0.8226898312568665, "logps/chosen": -9.545916557312012, "logps/rejected": -10.923840522766113, "loss": 0.4004, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.545916557312012, "rewards/margins": 1.3779232501983643, "rewards/rejected": -10.923840522766113, "semantic_entropy": 0.0021084630861878395, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 21.06645650645646, "learning_rate": 2.0764664969727086e-07, "logits/chosen": 0.8369568586349487, "logits/rejected": 0.9141233563423157, "logps/chosen": -9.500614166259766, "logps/rejected": -10.795511245727539, "loss": 0.3764, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.500614166259766, "rewards/margins": 1.2948954105377197, "rewards/rejected": -10.795511245727539, "semantic_entropy": 0.0015911769587546587, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 21.33908508690175, "learning_rate": 2.0638463518361033e-07, "logits/chosen": 0.7690576314926147, "logits/rejected": 0.8658930063247681, "logps/chosen": -9.389853477478027, "logps/rejected": -10.788896560668945, "loss": 0.3887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.389853477478027, "rewards/margins": 1.3990432024002075, "rewards/rejected": -10.788896560668945, "semantic_entropy": 0.002215514425188303, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 22.867041410821965, "learning_rate": 2.0512546932750702e-07, "logits/chosen": 0.7939780950546265, "logits/rejected": 0.8460676074028015, "logps/chosen": -9.5419282913208, "logps/rejected": -10.808481216430664, "loss": 0.3914, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.5419282913208, "rewards/margins": 1.2665529251098633, "rewards/rejected": -10.808481216430664, "semantic_entropy": 0.0017644502222537994, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 23.636550449380298, "learning_rate": 2.0386916434539343e-07, "logits/chosen": 0.8169394731521606, "logits/rejected": 0.8834837675094604, "logps/chosen": -9.410249710083008, "logps/rejected": -10.827165603637695, "loss": 0.3956, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.410249710083008, "rewards/margins": 1.416915774345398, "rewards/rejected": -10.827165603637695, "semantic_entropy": 0.001839539734646678, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 23.012604021963845, "learning_rate": 2.0261573242594627e-07, "logits/chosen": 0.853449821472168, "logits/rejected": 0.9539750218391418, "logps/chosen": -9.770658493041992, "logps/rejected": -11.06202507019043, "loss": 0.4337, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.770658493041992, "rewards/margins": 1.2913668155670166, "rewards/rejected": -11.06202507019043, "semantic_entropy": 0.0014862673124298453, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 25.51169802963628, "learning_rate": 2.0136518572996724e-07, "logits/chosen": 0.7688170671463013, "logits/rejected": 0.8904584646224976, "logps/chosen": -9.553049087524414, "logps/rejected": -11.032699584960938, "loss": 0.3851, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.553049087524414, "rewards/margins": 1.4796515703201294, "rewards/rejected": -11.032699584960938, "semantic_entropy": 0.0020472349133342505, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 23.510129434710546, "learning_rate": 2.0011753639026617e-07, "logits/chosen": 0.7789877653121948, "logits/rejected": 0.8432193994522095, "logps/chosen": -9.628904342651367, "logps/rejected": -10.928987503051758, "loss": 0.4125, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.628904342651367, "rewards/margins": 1.3000822067260742, "rewards/rejected": -10.928987503051758, "semantic_entropy": 0.001494646305218339, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 25.695966110067108, "learning_rate": 1.988727965115421e-07, "logits/chosen": 0.8373724222183228, "logits/rejected": 0.8486580848693848, "logps/chosen": -9.40330696105957, "logps/rejected": -10.690084457397461, "loss": 0.4098, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.40330696105957, "rewards/margins": 1.2867774963378906, "rewards/rejected": -10.690084457397461, "semantic_entropy": 0.0017762102652341127, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 17.76197982327494, "learning_rate": 1.9763097817026713e-07, "logits/chosen": 0.7693505883216858, "logits/rejected": 0.8610559701919556, "logps/chosen": -9.42241382598877, "logps/rejected": -11.041936874389648, "loss": 0.3312, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -9.42241382598877, "rewards/margins": 1.619523286819458, "rewards/rejected": -11.041936874389648, "semantic_entropy": 0.0017702898476272821, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 17.95604315957035, "learning_rate": 1.9639209341456796e-07, "logits/chosen": 0.8016265630722046, "logits/rejected": 0.8611480593681335, "logps/chosen": -9.537622451782227, "logps/rejected": -10.907397270202637, "loss": 0.4239, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.537622451782227, "rewards/margins": 1.3697750568389893, "rewards/rejected": -10.907397270202637, "semantic_entropy": 0.001632682979106903, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 16.93384833519254, "learning_rate": 1.951561542641102e-07, "logits/chosen": 0.8083820343017578, "logits/rejected": 0.8523054122924805, "logps/chosen": -9.715473175048828, "logps/rejected": -11.148602485656738, "loss": 0.4369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.715473175048828, "rewards/margins": 1.4331295490264893, "rewards/rejected": -11.148602485656738, "semantic_entropy": 0.0015578053425997496, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 18.601513607697683, "learning_rate": 1.939231727099806e-07, "logits/chosen": 0.7638577818870544, "logits/rejected": 0.7983411550521851, "logps/chosen": -9.586159706115723, "logps/rejected": -10.826416015625, "loss": 0.4351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.586159706115723, "rewards/margins": 1.2402559518814087, "rewards/rejected": -10.826416015625, "semantic_entropy": 0.001649503014050424, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 24.016266301167416, "learning_rate": 1.926931607145719e-07, "logits/chosen": 0.8312221765518188, "logits/rejected": 0.8952839970588684, "logps/chosen": -9.727499008178711, "logps/rejected": -11.005497932434082, "loss": 0.4144, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.727499008178711, "rewards/margins": 1.2779988050460815, "rewards/rejected": -11.005497932434082, "semantic_entropy": 0.0015145648503676057, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 20.566896327558037, "learning_rate": 1.9146613021146564e-07, "logits/chosen": 0.8225449323654175, "logits/rejected": 0.8508152961730957, "logps/chosen": -9.408650398254395, "logps/rejected": -10.707399368286133, "loss": 0.4173, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.408650398254395, "rewards/margins": 1.2987501621246338, "rewards/rejected": -10.707399368286133, "semantic_entropy": 0.0019972771406173706, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 22.344243213376547, "learning_rate": 1.9024209310531736e-07, "logits/chosen": 0.847356915473938, "logits/rejected": 0.8640506863594055, "logps/chosen": -9.579663276672363, "logps/rejected": -10.968865394592285, "loss": 0.4137, "rewards/accuracies": 0.8125, "rewards/chosen": -9.579663276672363, "rewards/margins": 1.3892011642456055, "rewards/rejected": -10.968865394592285, "semantic_entropy": 0.0021100840531289577, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 19.57624931515473, "learning_rate": 1.890210612717401e-07, "logits/chosen": 0.8184317350387573, "logits/rejected": 0.88373863697052, "logps/chosen": -9.572199821472168, "logps/rejected": -11.031925201416016, "loss": 0.3741, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.572199821472168, "rewards/margins": 1.4597254991531372, "rewards/rejected": -11.031925201416016, "semantic_entropy": 0.0016687295865267515, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 24.504791180268253, "learning_rate": 1.8780304655719054e-07, "logits/chosen": 0.8567901849746704, "logits/rejected": 0.9107440114021301, "logps/chosen": -9.613670349121094, "logps/rejected": -11.115751266479492, "loss": 0.3757, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.613670349121094, "rewards/margins": 1.5020800828933716, "rewards/rejected": -11.115751266479492, "semantic_entropy": 0.0012760651297867298, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 35.6519577235443, "learning_rate": 1.865880607788523e-07, "logits/chosen": 0.8858783841133118, "logits/rejected": 0.9201458096504211, "logps/chosen": -9.616140365600586, "logps/rejected": -10.997381210327148, "loss": 0.4086, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.616140365600586, "rewards/margins": 1.3812413215637207, "rewards/rejected": -10.997381210327148, "semantic_entropy": 0.0018040050053969026, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 26.229983342336126, "learning_rate": 1.8537611572452316e-07, "logits/chosen": 0.8341430425643921, "logits/rejected": 0.8626706004142761, "logps/chosen": -9.763750076293945, "logps/rejected": -10.994816780090332, "loss": 0.4061, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.763750076293945, "rewards/margins": 1.2310662269592285, "rewards/rejected": -10.994816780090332, "semantic_entropy": 0.001325559918768704, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 19.660501965854834, "learning_rate": 1.84167223152499e-07, "logits/chosen": 0.8540051579475403, "logits/rejected": 0.913652777671814, "logps/chosen": -9.744295120239258, "logps/rejected": -11.07103157043457, "loss": 0.4022, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.744295120239258, "rewards/margins": 1.3267360925674438, "rewards/rejected": -11.07103157043457, "semantic_entropy": 0.0015319742960855365, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 22.89745562199727, "learning_rate": 1.8296139479146112e-07, "logits/chosen": 0.7796264886856079, "logits/rejected": 0.8492299318313599, "logps/chosen": -9.544806480407715, "logps/rejected": -10.937907218933105, "loss": 0.3939, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.544806480407715, "rewards/margins": 1.3931005001068115, "rewards/rejected": -10.937907218933105, "semantic_entropy": 0.0017392231384292245, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 21.62752883795121, "learning_rate": 1.8175864234036132e-07, "logits/chosen": 0.8781774640083313, "logits/rejected": 0.9093042612075806, "logps/chosen": -9.595394134521484, "logps/rejected": -10.983304977416992, "loss": 0.417, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.595394134521484, "rewards/margins": 1.3879096508026123, "rewards/rejected": -10.983304977416992, "semantic_entropy": 0.0014014368643984199, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 22.389579813909606, "learning_rate": 1.805589774683094e-07, "logits/chosen": 0.7380444407463074, "logits/rejected": 0.7925786972045898, "logps/chosen": -9.564143180847168, "logps/rejected": -10.847589492797852, "loss": 0.3963, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.564143180847168, "rewards/margins": 1.2834450006484985, "rewards/rejected": -10.847589492797852, "semantic_entropy": 0.0015661569777876139, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 23.059520479950827, "learning_rate": 1.79362411814459e-07, "logits/chosen": 0.8489105105400085, "logits/rejected": 0.8289289474487305, "logps/chosen": -9.792860984802246, "logps/rejected": -10.936185836791992, "loss": 0.4414, "rewards/accuracies": 0.8125, "rewards/chosen": -9.792860984802246, "rewards/margins": 1.1433252096176147, "rewards/rejected": -10.936185836791992, "semantic_entropy": 0.0015545317437499762, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 20.613769102957825, "learning_rate": 1.7816895698789552e-07, "logits/chosen": 0.7959033250808716, "logits/rejected": 0.8639119267463684, "logps/chosen": -9.70136833190918, "logps/rejected": -10.945596694946289, "loss": 0.4202, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.70136833190918, "rewards/margins": 1.244227409362793, "rewards/rejected": -10.945596694946289, "semantic_entropy": 0.0013822594191879034, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 17.406251682106838, "learning_rate": 1.7697862456752271e-07, "logits/chosen": 0.7929319143295288, "logits/rejected": 0.8579456210136414, "logps/chosen": -9.719596862792969, "logps/rejected": -11.37035083770752, "loss": 0.3609, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.719596862792969, "rewards/margins": 1.6507545709609985, "rewards/rejected": -11.37035083770752, "semantic_entropy": 0.0013113311724737287, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 20.15920926644943, "learning_rate": 1.7579142610195124e-07, "logits/chosen": 0.7851302623748779, "logits/rejected": 0.8529064059257507, "logps/chosen": -9.851489067077637, "logps/rejected": -11.166707038879395, "loss": 0.4203, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.851489067077637, "rewards/margins": 1.3152183294296265, "rewards/rejected": -11.166707038879395, "semantic_entropy": 0.0013941864017397165, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 19.09139226174037, "learning_rate": 1.7460737310938568e-07, "logits/chosen": 0.8212282061576843, "logits/rejected": 0.877540111541748, "logps/chosen": -9.683355331420898, "logps/rejected": -11.168909072875977, "loss": 0.3788, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.683355331420898, "rewards/margins": 1.4855531454086304, "rewards/rejected": -11.168909072875977, "semantic_entropy": 0.0017290354007855058, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 19.755486537220495, "learning_rate": 1.734264770775133e-07, "logits/chosen": 0.7770802974700928, "logits/rejected": 0.8633508682250977, "logps/chosen": -9.642881393432617, "logps/rejected": -11.013988494873047, "loss": 0.4055, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.642881393432617, "rewards/margins": 1.3711069822311401, "rewards/rejected": -11.013988494873047, "semantic_entropy": 0.0014985213056206703, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 21.927676822159636, "learning_rate": 1.7224874946339241e-07, "logits/chosen": 0.8036985397338867, "logits/rejected": 0.8095115423202515, "logps/chosen": -9.7802152633667, "logps/rejected": -11.069761276245117, "loss": 0.4268, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.7802152633667, "rewards/margins": 1.289547085762024, "rewards/rejected": -11.069761276245117, "semantic_entropy": 0.0012104662600904703, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 16.00842803469047, "learning_rate": 1.7107420169334186e-07, "logits/chosen": 0.7866020202636719, "logits/rejected": 0.8434419631958008, "logps/chosen": -9.776571273803711, "logps/rejected": -11.023462295532227, "loss": 0.4244, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.776571273803711, "rewards/margins": 1.2468903064727783, "rewards/rejected": -11.023462295532227, "semantic_entropy": 0.0012111186515539885, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 17.323870558007282, "learning_rate": 1.6990284516282893e-07, "logits/chosen": 0.8010492324829102, "logits/rejected": 0.8359723091125488, "logps/chosen": -9.49112319946289, "logps/rejected": -10.895849227905273, "loss": 0.3877, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.49112319946289, "rewards/margins": 1.404726266860962, "rewards/rejected": -10.895849227905273, "semantic_entropy": 0.0014989904593676329, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 17.259101259042957, "learning_rate": 1.687346912363602e-07, "logits/chosen": 0.8071710467338562, "logits/rejected": 0.8544157147407532, "logps/chosen": -9.638090133666992, "logps/rejected": -11.011279106140137, "loss": 0.3814, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.638090133666992, "rewards/margins": 1.3731900453567505, "rewards/rejected": -11.011279106140137, "semantic_entropy": 0.0015442619333043694, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 15.415019825942553, "learning_rate": 1.675697512473697e-07, "logits/chosen": 0.8083289861679077, "logits/rejected": 0.9057637453079224, "logps/chosen": -9.574909210205078, "logps/rejected": -10.998074531555176, "loss": 0.3687, "rewards/accuracies": 0.84375, "rewards/chosen": -9.574909210205078, "rewards/margins": 1.4231641292572021, "rewards/rejected": -10.998074531555176, "semantic_entropy": 0.00155646784696728, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 21.576286115755277, "learning_rate": 1.6640803649811087e-07, "logits/chosen": 0.8501211404800415, "logits/rejected": 0.9308522343635559, "logps/chosen": -9.679555892944336, "logps/rejected": -11.19702434539795, "loss": 0.3827, "rewards/accuracies": 0.8125, "rewards/chosen": -9.679555892944336, "rewards/margins": 1.517469048500061, "rewards/rejected": -11.19702434539795, "semantic_entropy": 0.0016793437534943223, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 26.719598028515872, "learning_rate": 1.6524955825954472e-07, "logits/chosen": 0.8302766680717468, "logits/rejected": 0.8800037503242493, "logps/chosen": -9.66600513458252, "logps/rejected": -10.949085235595703, "loss": 0.4151, "rewards/accuracies": 0.8125, "rewards/chosen": -9.66600513458252, "rewards/margins": 1.2830795049667358, "rewards/rejected": -10.949085235595703, "semantic_entropy": 0.001497269026003778, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 18.483844690586892, "learning_rate": 1.6409432777123277e-07, "logits/chosen": 0.8208200335502625, "logits/rejected": 0.8599546551704407, "logps/chosen": -9.824455261230469, "logps/rejected": -11.335619926452637, "loss": 0.3885, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.824455261230469, "rewards/margins": 1.511163353919983, "rewards/rejected": -11.335619926452637, "semantic_entropy": 0.0013760743895545602, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 20.65468077960879, "learning_rate": 1.6294235624122577e-07, "logits/chosen": 0.8452394604682922, "logits/rejected": 0.9035156965255737, "logps/chosen": -9.813318252563477, "logps/rejected": -11.151365280151367, "loss": 0.4044, "rewards/accuracies": 0.8125, "rewards/chosen": -9.813318252563477, "rewards/margins": 1.338047742843628, "rewards/rejected": -11.151365280151367, "semantic_entropy": 0.0013139288639649749, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 24.257011022001592, "learning_rate": 1.6179365484595697e-07, "logits/chosen": 0.7976378202438354, "logits/rejected": 0.8221977353096008, "logps/chosen": -9.780439376831055, "logps/rejected": -11.116656303405762, "loss": 0.4225, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.780439376831055, "rewards/margins": 1.3362162113189697, "rewards/rejected": -11.116656303405762, "semantic_entropy": 0.0013957961928099394, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 22.65516964126615, "learning_rate": 1.60648234730132e-07, "logits/chosen": 0.8350450396537781, "logits/rejected": 0.8612421154975891, "logps/chosen": -9.680601119995117, "logps/rejected": -11.20177936553955, "loss": 0.3626, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.680601119995117, "rewards/margins": 1.5211775302886963, "rewards/rejected": -11.20177936553955, "semantic_entropy": 0.0013702240539714694, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 29.58219434395317, "learning_rate": 1.595061070066222e-07, "logits/chosen": 0.8323311805725098, "logits/rejected": 0.8706264495849609, "logps/chosen": -9.795225143432617, "logps/rejected": -11.236984252929688, "loss": 0.3767, "rewards/accuracies": 0.84375, "rewards/chosen": -9.795225143432617, "rewards/margins": 1.441759467124939, "rewards/rejected": -11.236984252929688, "semantic_entropy": 0.0013441203627735376, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 29.102509702206316, "learning_rate": 1.5836728275635542e-07, "logits/chosen": 0.7748151421546936, "logits/rejected": 0.81391441822052, "logps/chosen": -9.950288772583008, "logps/rejected": -11.165694236755371, "loss": 0.4377, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.950288772583008, "rewards/margins": 1.215405821800232, "rewards/rejected": -11.165694236755371, "semantic_entropy": 0.0010313175152987242, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 21.581567035471323, "learning_rate": 1.5723177302820984e-07, "logits/chosen": 0.8050596117973328, "logits/rejected": 0.8407198190689087, "logps/chosen": -9.88306999206543, "logps/rejected": -11.050642013549805, "loss": 0.4321, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.88306999206543, "rewards/margins": 1.167571783065796, "rewards/rejected": -11.050642013549805, "semantic_entropy": 0.0010738309938460588, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 18.06623114943259, "learning_rate": 1.5609958883890544e-07, "logits/chosen": 0.8042596578598022, "logits/rejected": 0.8785734176635742, "logps/chosen": -9.795085906982422, "logps/rejected": -11.104022979736328, "loss": 0.3878, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.795085906982422, "rewards/margins": 1.308937907218933, "rewards/rejected": -11.104022979736328, "semantic_entropy": 0.0012584684882313013, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 24.33590640447122, "learning_rate": 1.5497074117289865e-07, "logits/chosen": 0.7726608514785767, "logits/rejected": 0.8293962478637695, "logps/chosen": -9.73291015625, "logps/rejected": -11.144505500793457, "loss": 0.4062, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.73291015625, "rewards/margins": 1.4115943908691406, "rewards/rejected": -11.144505500793457, "semantic_entropy": 0.002003467408940196, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 19.852698261404328, "learning_rate": 1.5384524098227402e-07, "logits/chosen": 0.8046091198921204, "logits/rejected": 0.8712922930717468, "logps/chosen": -9.885152816772461, "logps/rejected": -11.513572692871094, "loss": 0.3311, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.885152816772461, "rewards/margins": 1.6284195184707642, "rewards/rejected": -11.513572692871094, "semantic_entropy": 0.0014156540855765343, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 23.28791831420902, "learning_rate": 1.5272309918663974e-07, "logits/chosen": 0.7911036610603333, "logits/rejected": 0.8605779409408569, "logps/chosen": -9.974563598632812, "logps/rejected": -11.249377250671387, "loss": 0.467, "rewards/accuracies": 0.75, "rewards/chosen": -9.974563598632812, "rewards/margins": 1.2748134136199951, "rewards/rejected": -11.249377250671387, "semantic_entropy": 0.0012652326840907335, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 18.031587200544166, "learning_rate": 1.516043266730201e-07, "logits/chosen": 0.8097645044326782, "logits/rejected": 0.8588771820068359, "logps/chosen": -9.800325393676758, "logps/rejected": -11.185141563415527, "loss": 0.393, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.800325393676758, "rewards/margins": 1.3848176002502441, "rewards/rejected": -11.185141563415527, "semantic_entropy": 0.0018925167387351394, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 29.399301801147324, "learning_rate": 1.504889342957512e-07, "logits/chosen": 0.7945131063461304, "logits/rejected": 0.8561771512031555, "logps/chosen": -9.814886093139648, "logps/rejected": -11.265652656555176, "loss": 0.4417, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.814886093139648, "rewards/margins": 1.45076584815979, "rewards/rejected": -11.265652656555176, "semantic_entropy": 0.0013577769277617335, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 18.818414850816456, "learning_rate": 1.4937693287637453e-07, "logits/chosen": 0.782823920249939, "logits/rejected": 0.8478276133537292, "logps/chosen": -9.825902938842773, "logps/rejected": -11.110780715942383, "loss": 0.4206, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.825902938842773, "rewards/margins": 1.2848764657974243, "rewards/rejected": -11.110780715942383, "semantic_entropy": 0.0011613890528678894, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 23.834103955908468, "learning_rate": 1.4826833320353305e-07, "logits/chosen": 0.7609673142433167, "logits/rejected": 0.8239792585372925, "logps/chosen": -9.851580619812012, "logps/rejected": -11.248353958129883, "loss": 0.385, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.851580619812012, "rewards/margins": 1.3967727422714233, "rewards/rejected": -11.248353958129883, "semantic_entropy": 0.0012605976080521941, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 23.686138337629355, "learning_rate": 1.4716314603286528e-07, "logits/chosen": 0.8113320469856262, "logits/rejected": 0.853225588798523, "logps/chosen": -9.844433784484863, "logps/rejected": -11.266097068786621, "loss": 0.4029, "rewards/accuracies": 0.78125, "rewards/chosen": -9.844433784484863, "rewards/margins": 1.4216625690460205, "rewards/rejected": -11.266097068786621, "semantic_entropy": 0.0013559302315115929, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 25.669925692295536, "learning_rate": 1.4606138208690233e-07, "logits/chosen": 0.7266643643379211, "logits/rejected": 0.8064903020858765, "logps/chosen": -9.897780418395996, "logps/rejected": -11.229866981506348, "loss": 0.4261, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.897780418395996, "rewards/margins": 1.3320866823196411, "rewards/rejected": -11.229866981506348, "semantic_entropy": 0.0012010873761028051, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 24.94777596239184, "learning_rate": 1.4496305205496251e-07, "logits/chosen": 0.796136200428009, "logits/rejected": 0.8669074177742004, "logps/chosen": -9.997017860412598, "logps/rejected": -11.391059875488281, "loss": 0.3978, "rewards/accuracies": 0.8125, "rewards/chosen": -9.997017860412598, "rewards/margins": 1.3940420150756836, "rewards/rejected": -11.391059875488281, "semantic_entropy": 0.0011204956099390984, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 25.67221825991835, "learning_rate": 1.4386816659304895e-07, "logits/chosen": 0.7781258225440979, "logits/rejected": 0.8242195248603821, "logps/chosen": -9.795356750488281, "logps/rejected": -11.114994049072266, "loss": 0.3966, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -9.795356750488281, "rewards/margins": 1.3196370601654053, "rewards/rejected": -11.114994049072266, "semantic_entropy": 0.0012639164924621582, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 29.57080046877201, "learning_rate": 1.4277673632374492e-07, "logits/chosen": 0.7557036876678467, "logits/rejected": 0.8132128715515137, "logps/chosen": -9.90876579284668, "logps/rejected": -11.25818920135498, "loss": 0.4118, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.90876579284668, "rewards/margins": 1.3494237661361694, "rewards/rejected": -11.25818920135498, "semantic_entropy": 0.0013076277682557702, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 20.78894701815076, "learning_rate": 1.416887718361119e-07, "logits/chosen": 0.8289060592651367, "logits/rejected": 0.8358928561210632, "logps/chosen": -9.96793270111084, "logps/rejected": -11.24679183959961, "loss": 0.4306, "rewards/accuracies": 0.8125, "rewards/chosen": -9.96793270111084, "rewards/margins": 1.2788599729537964, "rewards/rejected": -11.24679183959961, "semantic_entropy": 0.0013636414660140872, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 26.519372471909644, "learning_rate": 1.406042836855859e-07, "logits/chosen": 0.8788352012634277, "logits/rejected": 0.9096766710281372, "logps/chosen": -9.882646560668945, "logps/rejected": -11.346869468688965, "loss": 0.3805, "rewards/accuracies": 0.84375, "rewards/chosen": -9.882646560668945, "rewards/margins": 1.464221715927124, "rewards/rejected": -11.346869468688965, "semantic_entropy": 0.0013449579710140824, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 26.805430797019667, "learning_rate": 1.3952328239387595e-07, "logits/chosen": 0.7472053170204163, "logits/rejected": 0.8391642570495605, "logps/chosen": -9.822053909301758, "logps/rejected": -11.334062576293945, "loss": 0.3744, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.822053909301758, "rewards/margins": 1.5120099782943726, "rewards/rejected": -11.334062576293945, "semantic_entropy": 0.0012804374564439058, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 23.745463280246494, "learning_rate": 1.3844577844886109e-07, "logits/chosen": 0.8212148547172546, "logits/rejected": 0.9034450650215149, "logps/chosen": -9.849560737609863, "logps/rejected": -11.316389083862305, "loss": 0.3922, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.849560737609863, "rewards/margins": 1.4668283462524414, "rewards/rejected": -11.316389083862305, "semantic_entropy": 0.0014727965462952852, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 24.765118352803277, "learning_rate": 1.3737178230448955e-07, "logits/chosen": 0.706335723400116, "logits/rejected": 0.7600988149642944, "logps/chosen": -10.072949409484863, "logps/rejected": -11.255311965942383, "loss": 0.4585, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -10.072949409484863, "rewards/margins": 1.1823631525039673, "rewards/rejected": -11.255311965942383, "semantic_entropy": 0.0010486546671018004, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 16.53037049152132, "learning_rate": 1.363013043806764e-07, "logits/chosen": 0.8160565495491028, "logits/rejected": 0.8825929760932922, "logps/chosen": -9.718530654907227, "logps/rejected": -11.098315238952637, "loss": 0.375, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.718530654907227, "rewards/margins": 1.3797847032546997, "rewards/rejected": -11.098315238952637, "semantic_entropy": 0.0014672328252345324, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 19.343996568570635, "learning_rate": 1.352343550632034e-07, "logits/chosen": 0.8211394548416138, "logits/rejected": 0.8589351773262024, "logps/chosen": -9.79681396484375, "logps/rejected": -11.286918640136719, "loss": 0.4092, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.79681396484375, "rewards/margins": 1.4901044368743896, "rewards/rejected": -11.286918640136719, "semantic_entropy": 0.0013513191370293498, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 18.946006963734423, "learning_rate": 1.3417094470361722e-07, "logits/chosen": 0.777470052242279, "logits/rejected": 0.8312585949897766, "logps/chosen": -9.910394668579102, "logps/rejected": -11.160832405090332, "loss": 0.428, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.910394668579102, "rewards/margins": 1.2504370212554932, "rewards/rejected": -11.160832405090332, "semantic_entropy": 0.0012193446746096015, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.9214699268341064, "eval_logits/rejected": 0.9721218943595886, "eval_logps/chosen": -9.951480865478516, "eval_logps/rejected": -11.088980674743652, "eval_loss": 0.5250210762023926, "eval_rewards/accuracies": 0.721068263053894, "eval_rewards/chosen": -9.951480865478516, "eval_rewards/margins": 1.1374988555908203, "eval_rewards/rejected": -11.088980674743652, "eval_runtime": 35.1208, "eval_samples_per_second": 38.296, "eval_semantic_entropy": 0.0012979113962501287, "eval_steps_per_second": 9.595, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 24.25396278476502, "learning_rate": 1.3311108361913015e-07, "logits/chosen": 0.7317711114883423, "logits/rejected": 0.7822341322898865, "logps/chosen": -9.765314102172852, "logps/rejected": -11.11705207824707, "loss": 0.3892, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.765314102172852, "rewards/margins": 1.3517379760742188, "rewards/rejected": -11.11705207824707, "semantic_entropy": 0.001415650942362845, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 16.799881334103528, "learning_rate": 1.3205478209251874e-07, "logits/chosen": 0.8066733479499817, "logits/rejected": 0.9000295400619507, "logps/chosen": -9.963193893432617, "logps/rejected": -11.469663619995117, "loss": 0.3841, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.963193893432617, "rewards/margins": 1.5064703226089478, "rewards/rejected": -11.469663619995117, "semantic_entropy": 0.001393306301906705, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 22.10955918100729, "learning_rate": 1.310020503720254e-07, "logits/chosen": 0.7781058549880981, "logits/rejected": 0.822067141532898, "logps/chosen": -9.866239547729492, "logps/rejected": -11.280614852905273, "loss": 0.4068, "rewards/accuracies": 0.8125, "rewards/chosen": -9.866239547729492, "rewards/margins": 1.4143754243850708, "rewards/rejected": -11.280614852905273, "semantic_entropy": 0.0012620962224900723, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 24.95675503053961, "learning_rate": 1.2995289867125752e-07, "logits/chosen": 0.7621157765388489, "logits/rejected": 0.7959357500076294, "logps/chosen": -9.751391410827637, "logps/rejected": -10.94892692565918, "loss": 0.4499, "rewards/accuracies": 0.78125, "rewards/chosen": -9.751391410827637, "rewards/margins": 1.1975345611572266, "rewards/rejected": -10.94892692565918, "semantic_entropy": 0.001342722331173718, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 22.719615672561844, "learning_rate": 1.2890733716908986e-07, "logits/chosen": 0.7777091264724731, "logits/rejected": 0.860866367816925, "logps/chosen": -9.617403030395508, "logps/rejected": -11.080734252929688, "loss": 0.3296, "rewards/accuracies": 0.875, "rewards/chosen": -9.617403030395508, "rewards/margins": 1.4633299112319946, "rewards/rejected": -11.080734252929688, "semantic_entropy": 0.0015527913346886635, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 22.213207378400554, "learning_rate": 1.2786537600956454e-07, "logits/chosen": 0.7948800325393677, "logits/rejected": 0.8321939706802368, "logps/chosen": -9.664536476135254, "logps/rejected": -11.117993354797363, "loss": 0.3919, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.664536476135254, "rewards/margins": 1.4534571170806885, "rewards/rejected": -11.117993354797363, "semantic_entropy": 0.0012703756801784039, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 17.553736783320986, "learning_rate": 1.268270253017933e-07, "logits/chosen": 0.8016083836555481, "logits/rejected": 0.8781582117080688, "logps/chosen": -9.703906059265137, "logps/rejected": -11.203841209411621, "loss": 0.3737, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.703906059265137, "rewards/margins": 1.4999356269836426, "rewards/rejected": -11.203841209411621, "semantic_entropy": 0.0014606801560148597, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 20.144746908461087, "learning_rate": 1.257922951198591e-07, "logits/chosen": 0.7229622602462769, "logits/rejected": 0.8273760676383972, "logps/chosen": -9.712352752685547, "logps/rejected": -11.075556755065918, "loss": 0.4149, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.712352752685547, "rewards/margins": 1.363203763961792, "rewards/rejected": -11.075556755065918, "semantic_entropy": 0.0015043210005387664, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 21.29344289082141, "learning_rate": 1.24761195502719e-07, "logits/chosen": 0.765488862991333, "logits/rejected": 0.8269311189651489, "logps/chosen": -9.819905281066895, "logps/rejected": -11.093830108642578, "loss": 0.4315, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.819905281066895, "rewards/margins": 1.2739253044128418, "rewards/rejected": -11.093830108642578, "semantic_entropy": 0.0011897517833858728, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 31.858698430890097, "learning_rate": 1.2373373645410573e-07, "logits/chosen": 0.8017476797103882, "logits/rejected": 0.8760465383529663, "logps/chosen": -9.925119400024414, "logps/rejected": -11.373433113098145, "loss": 0.4195, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.925119400024414, "rewards/margins": 1.4483143091201782, "rewards/rejected": -11.373433113098145, "semantic_entropy": 0.001272709690965712, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 21.93569257046853, "learning_rate": 1.2270992794243175e-07, "logits/chosen": 0.7383990287780762, "logits/rejected": 0.8073341250419617, "logps/chosen": -9.690180778503418, "logps/rejected": -11.078446388244629, "loss": 0.4162, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.690180778503418, "rewards/margins": 1.3882659673690796, "rewards/rejected": -11.078446388244629, "semantic_entropy": 0.0013964849058538675, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 19.97322050449576, "learning_rate": 1.2168977990069147e-07, "logits/chosen": 0.7342582941055298, "logits/rejected": 0.7973549962043762, "logps/chosen": -9.67171573638916, "logps/rejected": -10.98813533782959, "loss": 0.4229, "rewards/accuracies": 0.75, "rewards/chosen": -9.67171573638916, "rewards/margins": 1.316420316696167, "rewards/rejected": -10.98813533782959, "semantic_entropy": 0.001446625916287303, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 20.71353603983738, "learning_rate": 1.206733022263659e-07, "logits/chosen": 0.7267228960990906, "logits/rejected": 0.8412584066390991, "logps/chosen": -9.8917875289917, "logps/rejected": -11.24048137664795, "loss": 0.4238, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.8917875289917, "rewards/margins": 1.3486926555633545, "rewards/rejected": -11.24048137664795, "semantic_entropy": 0.0015950720990076661, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 24.253708429812765, "learning_rate": 1.1966050478132572e-07, "logits/chosen": 0.7586138844490051, "logits/rejected": 0.8224746584892273, "logps/chosen": -9.68405532836914, "logps/rejected": -11.180296897888184, "loss": 0.3864, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -9.68405532836914, "rewards/margins": 1.4962437152862549, "rewards/rejected": -11.180296897888184, "semantic_entropy": 0.0014107396127656102, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 21.753207135798707, "learning_rate": 1.1865139739173635e-07, "logits/chosen": 0.7383859753608704, "logits/rejected": 0.8205004930496216, "logps/chosen": -9.776418685913086, "logps/rejected": -11.116586685180664, "loss": 0.4023, "rewards/accuracies": 0.84375, "rewards/chosen": -9.776418685913086, "rewards/margins": 1.34016752243042, "rewards/rejected": -11.116586685180664, "semantic_entropy": 0.0015196467284113169, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 37.45298900218617, "learning_rate": 1.1764598984796187e-07, "logits/chosen": 0.7673597931861877, "logits/rejected": 0.8331373333930969, "logps/chosen": -9.831031799316406, "logps/rejected": -11.022607803344727, "loss": 0.4243, "rewards/accuracies": 0.8125, "rewards/chosen": -9.831031799316406, "rewards/margins": 1.1915762424468994, "rewards/rejected": -11.022607803344727, "semantic_entropy": 0.001290981424972415, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 29.063993723072343, "learning_rate": 1.1664429190447095e-07, "logits/chosen": 0.7792296409606934, "logits/rejected": 0.8261978030204773, "logps/chosen": -9.763978958129883, "logps/rejected": -11.260164260864258, "loss": 0.3626, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.763978958129883, "rewards/margins": 1.496183156967163, "rewards/rejected": -11.260164260864258, "semantic_entropy": 0.0017484973650425673, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 21.46210084893414, "learning_rate": 1.1564631327974122e-07, "logits/chosen": 0.7814306616783142, "logits/rejected": 0.8631542921066284, "logps/chosen": -9.915163040161133, "logps/rejected": -11.228841781616211, "loss": 0.4225, "rewards/accuracies": 0.78125, "rewards/chosen": -9.915163040161133, "rewards/margins": 1.3136794567108154, "rewards/rejected": -11.228841781616211, "semantic_entropy": 0.0011776359751820564, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 20.87464472480962, "learning_rate": 1.1465206365616587e-07, "logits/chosen": 0.6937421560287476, "logits/rejected": 0.7896022796630859, "logps/chosen": -9.791691780090332, "logps/rejected": -11.08985710144043, "loss": 0.4082, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.791691780090332, "rewards/margins": 1.2981641292572021, "rewards/rejected": -11.08985710144043, "semantic_entropy": 0.0014528365572914481, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 22.141016604754245, "learning_rate": 1.1366155267995887e-07, "logits/chosen": 0.8213682174682617, "logits/rejected": 0.8454049825668335, "logps/chosen": -9.781519889831543, "logps/rejected": -11.107604026794434, "loss": 0.3978, "rewards/accuracies": 0.8125, "rewards/chosen": -9.781519889831543, "rewards/margins": 1.3260858058929443, "rewards/rejected": -11.107604026794434, "semantic_entropy": 0.0014172891387715936, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 20.81249553944227, "learning_rate": 1.1267478996106228e-07, "logits/chosen": 0.8247060775756836, "logits/rejected": 0.9181084632873535, "logps/chosen": -9.852422714233398, "logps/rejected": -11.161924362182617, "loss": 0.4153, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.852422714233398, "rewards/margins": 1.3095014095306396, "rewards/rejected": -11.161924362182617, "semantic_entropy": 0.0011818426428362727, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 22.065604053110732, "learning_rate": 1.116917850730521e-07, "logits/chosen": 0.7819138765335083, "logits/rejected": 0.8265460133552551, "logps/chosen": -9.933201789855957, "logps/rejected": -11.143549919128418, "loss": 0.5036, "rewards/accuracies": 0.75, "rewards/chosen": -9.933201789855957, "rewards/margins": 1.2103482484817505, "rewards/rejected": -11.143549919128418, "semantic_entropy": 0.0012393039651215076, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 17.547878202239335, "learning_rate": 1.1071254755304637e-07, "logits/chosen": 0.7445524334907532, "logits/rejected": 0.7736660242080688, "logps/chosen": -9.694409370422363, "logps/rejected": -11.00683307647705, "loss": 0.42, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.694409370422363, "rewards/margins": 1.3124234676361084, "rewards/rejected": -11.00683307647705, "semantic_entropy": 0.001383893541060388, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 23.631179882315553, "learning_rate": 1.0973708690161143e-07, "logits/chosen": 0.792984664440155, "logits/rejected": 0.8252687454223633, "logps/chosen": -9.863832473754883, "logps/rejected": -11.273421287536621, "loss": 0.3964, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.863832473754883, "rewards/margins": 1.409589409828186, "rewards/rejected": -11.273421287536621, "semantic_entropy": 0.001443797373212874, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 31.566956401950897, "learning_rate": 1.0876541258267119e-07, "logits/chosen": 0.7816181182861328, "logits/rejected": 0.873005747795105, "logps/chosen": -9.954813003540039, "logps/rejected": -11.395962715148926, "loss": 0.3918, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.954813003540039, "rewards/margins": 1.4411489963531494, "rewards/rejected": -11.395962715148926, "semantic_entropy": 0.0011875508353114128, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 25.748276815440583, "learning_rate": 1.0779753402341379e-07, "logits/chosen": 0.7940434813499451, "logits/rejected": 0.840873122215271, "logps/chosen": -9.88911247253418, "logps/rejected": -11.033079147338867, "loss": 0.4659, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -9.88911247253418, "rewards/margins": 1.1439659595489502, "rewards/rejected": -11.033079147338867, "semantic_entropy": 0.0013449281686916947, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 23.532526327852437, "learning_rate": 1.0683346061420157e-07, "logits/chosen": 0.8855890035629272, "logits/rejected": 0.8981055021286011, "logps/chosen": -9.778970718383789, "logps/rejected": -11.14280891418457, "loss": 0.4195, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.778970718383789, "rewards/margins": 1.363840103149414, "rewards/rejected": -11.14280891418457, "semantic_entropy": 0.001345223980024457, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 23.144715325749804, "learning_rate": 1.0587320170847874e-07, "logits/chosen": 0.7933780550956726, "logits/rejected": 0.8676679730415344, "logps/chosen": -9.759759902954102, "logps/rejected": -10.895282745361328, "loss": 0.4728, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.759759902954102, "rewards/margins": 1.135524034500122, "rewards/rejected": -10.895282745361328, "semantic_entropy": 0.001288101659156382, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 21.014853352663092, "learning_rate": 1.0491676662268156e-07, "logits/chosen": 0.8033139109611511, "logits/rejected": 0.8606590032577515, "logps/chosen": -9.787649154663086, "logps/rejected": -11.041508674621582, "loss": 0.4454, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.787649154663086, "rewards/margins": 1.2538607120513916, "rewards/rejected": -11.041508674621582, "semantic_entropy": 0.0012479587458074093, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 25.574697677651937, "learning_rate": 1.0396416463614732e-07, "logits/chosen": 0.7537301182746887, "logits/rejected": 0.814228355884552, "logps/chosen": -9.702180862426758, "logps/rejected": -11.04191780090332, "loss": 0.42, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.702180862426758, "rewards/margins": 1.3397365808486938, "rewards/rejected": -11.04191780090332, "semantic_entropy": 0.001320059527643025, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 24.618729215771705, "learning_rate": 1.0301540499102479e-07, "logits/chosen": 0.7519547343254089, "logits/rejected": 0.8329681158065796, "logps/chosen": -9.94709587097168, "logps/rejected": -10.988082885742188, "loss": 0.4853, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.94709587097168, "rewards/margins": 1.0409865379333496, "rewards/rejected": -10.988082885742188, "semantic_entropy": 0.0011719849426299334, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 26.17462652294696, "learning_rate": 1.0207049689218405e-07, "logits/chosen": 0.7849665284156799, "logits/rejected": 0.845086932182312, "logps/chosen": -9.825363159179688, "logps/rejected": -11.26807975769043, "loss": 0.4079, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.825363159179688, "rewards/margins": 1.4427168369293213, "rewards/rejected": -11.26807975769043, "semantic_entropy": 0.0016312900697812438, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 19.057899173583554, "learning_rate": 1.0112944950712782e-07, "logits/chosen": 0.7082661986351013, "logits/rejected": 0.7645975947380066, "logps/chosen": -9.70044994354248, "logps/rejected": -11.13469123840332, "loss": 0.3632, "rewards/accuracies": 0.8125, "rewards/chosen": -9.70044994354248, "rewards/margins": 1.434242606163025, "rewards/rejected": -11.13469123840332, "semantic_entropy": 0.001436132937669754, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 22.488252138217725, "learning_rate": 1.0019227196590174e-07, "logits/chosen": 0.8336771130561829, "logits/rejected": 0.8841003179550171, "logps/chosen": -9.908154487609863, "logps/rejected": -11.13020133972168, "loss": 0.4829, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.908154487609863, "rewards/margins": 1.2220475673675537, "rewards/rejected": -11.13020133972168, "semantic_entropy": 0.0014596920227631927, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 23.13587206070571, "learning_rate": 9.925897336100664e-08, "logits/chosen": 0.8057346343994141, "logits/rejected": 0.850749135017395, "logps/chosen": -9.688154220581055, "logps/rejected": -11.131349563598633, "loss": 0.3866, "rewards/accuracies": 0.8125, "rewards/chosen": -9.688154220581055, "rewards/margins": 1.4431952238082886, "rewards/rejected": -11.131349563598633, "semantic_entropy": 0.0014849931467324495, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 23.357693931056765, "learning_rate": 9.832956274730946e-08, "logits/chosen": 0.7591571807861328, "logits/rejected": 0.7910270094871521, "logps/chosen": -9.584843635559082, "logps/rejected": -10.765449523925781, "loss": 0.4539, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -9.584843635559082, "rewards/margins": 1.1806063652038574, "rewards/rejected": -10.765449523925781, "semantic_entropy": 0.0015608349349349737, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 20.637887868629868, "learning_rate": 9.740404914195633e-08, "logits/chosen": 0.7534157037734985, "logits/rejected": 0.842387855052948, "logps/chosen": -9.789365768432617, "logps/rejected": -11.138287544250488, "loss": 0.4176, "rewards/accuracies": 0.84375, "rewards/chosen": -9.789365768432617, "rewards/margins": 1.3489205837249756, "rewards/rejected": -11.138287544250488, "semantic_entropy": 0.0012669655261561275, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 18.543572299518026, "learning_rate": 9.648244152428392e-08, "logits/chosen": 0.7632014751434326, "logits/rejected": 0.8216020464897156, "logps/chosen": -9.654337882995605, "logps/rejected": -10.88366413116455, "loss": 0.4224, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.654337882995605, "rewards/margins": 1.229326605796814, "rewards/rejected": -10.88366413116455, "semantic_entropy": 0.0014172986848279834, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 19.149278895897503, "learning_rate": 9.556474883573379e-08, "logits/chosen": 0.7528376579284668, "logits/rejected": 0.8152421116828918, "logps/chosen": -9.646097183227539, "logps/rejected": -11.058183670043945, "loss": 0.4187, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.646097183227539, "rewards/margins": 1.412088394165039, "rewards/rejected": -11.058183670043945, "semantic_entropy": 0.0015735877677798271, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 21.187119688281765, "learning_rate": 9.465097997976412e-08, "logits/chosen": 0.7996068000793457, "logits/rejected": 0.8711563348770142, "logps/chosen": -9.832134246826172, "logps/rejected": -11.303122520446777, "loss": 0.3744, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.832134246826172, "rewards/margins": 1.4709880352020264, "rewards/rejected": -11.303122520446777, "semantic_entropy": 0.0013540387153625488, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 21.004241219678757, "learning_rate": 9.374114382176457e-08, "logits/chosen": 0.7817031741142273, "logits/rejected": 0.8433337211608887, "logps/chosen": -9.771738052368164, "logps/rejected": -11.173564910888672, "loss": 0.3894, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.771738052368164, "rewards/margins": 1.4018254280090332, "rewards/rejected": -11.173564910888672, "semantic_entropy": 0.0012005962198600173, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 27.55976528492471, "learning_rate": 9.283524918896945e-08, "logits/chosen": 0.7919789552688599, "logits/rejected": 0.8178110122680664, "logps/chosen": -9.770918846130371, "logps/rejected": -11.095891952514648, "loss": 0.437, "rewards/accuracies": 0.78125, "rewards/chosen": -9.770918846130371, "rewards/margins": 1.3249746561050415, "rewards/rejected": -11.095891952514648, "semantic_entropy": 0.0011873061303049326, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 20.74798992950919, "learning_rate": 9.193330487037232e-08, "logits/chosen": 0.814818263053894, "logits/rejected": 0.8907683491706848, "logps/chosen": -9.838384628295898, "logps/rejected": -11.240675926208496, "loss": 0.3874, "rewards/accuracies": 0.8125, "rewards/chosen": -9.838384628295898, "rewards/margins": 1.402291178703308, "rewards/rejected": -11.240675926208496, "semantic_entropy": 0.0014131965581327677, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 17.77642999066438, "learning_rate": 9.103531961664118e-08, "logits/chosen": 0.7889447808265686, "logits/rejected": 0.8778635859489441, "logps/chosen": -9.612691879272461, "logps/rejected": -10.984048843383789, "loss": 0.3752, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.612691879272461, "rewards/margins": 1.3713561296463013, "rewards/rejected": -10.984048843383789, "semantic_entropy": 0.0014132572105154395, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 19.68502083527493, "learning_rate": 9.014130214003269e-08, "logits/chosen": 0.7648957967758179, "logits/rejected": 0.7786797881126404, "logps/chosen": -9.624971389770508, "logps/rejected": -11.085968971252441, "loss": 0.3885, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.624971389770508, "rewards/margins": 1.460997223854065, "rewards/rejected": -11.085968971252441, "semantic_entropy": 0.0014543391298502684, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 23.013655732727454, "learning_rate": 8.925126111430848e-08, "logits/chosen": 0.7716919183731079, "logits/rejected": 0.8139322996139526, "logps/chosen": -9.422433853149414, "logps/rejected": -10.843701362609863, "loss": 0.4095, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.422433853149414, "rewards/margins": 1.4212672710418701, "rewards/rejected": -10.843701362609863, "semantic_entropy": 0.001683591166511178, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 24.587592896382663, "learning_rate": 8.83652051746504e-08, "logits/chosen": 0.9284713864326477, "logits/rejected": 0.9669192433357239, "logps/chosen": -9.819892883300781, "logps/rejected": -11.273547172546387, "loss": 0.3939, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.819892883300781, "rewards/margins": 1.4536547660827637, "rewards/rejected": -11.273547172546387, "semantic_entropy": 0.0012030914658680558, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 22.016927492062443, "learning_rate": 8.748314291757696e-08, "logits/chosen": 0.7996488213539124, "logits/rejected": 0.8577510714530945, "logps/chosen": -9.611989974975586, "logps/rejected": -10.817550659179688, "loss": 0.4296, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.611989974975586, "rewards/margins": 1.2055622339248657, "rewards/rejected": -10.817550659179688, "semantic_entropy": 0.0014349967241287231, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 20.553055577709493, "learning_rate": 8.660508290086032e-08, "logits/chosen": 0.8431406021118164, "logits/rejected": 0.930561900138855, "logps/chosen": -9.651968002319336, "logps/rejected": -11.155494689941406, "loss": 0.3923, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.651968002319336, "rewards/margins": 1.5035268068313599, "rewards/rejected": -11.155494689941406, "semantic_entropy": 0.001411119825206697, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 29.66982937376926, "learning_rate": 8.573103364344231e-08, "logits/chosen": 0.7703269124031067, "logits/rejected": 0.8427858352661133, "logps/chosen": -9.545438766479492, "logps/rejected": -10.993854522705078, "loss": 0.3839, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.545438766479492, "rewards/margins": 1.4484152793884277, "rewards/rejected": -10.993854522705078, "semantic_entropy": 0.0015610662521794438, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 24.578094627007825, "learning_rate": 8.486100362535292e-08, "logits/chosen": 0.7740985751152039, "logits/rejected": 0.851282000541687, "logps/chosen": -9.825765609741211, "logps/rejected": -10.998079299926758, "loss": 0.4317, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.825765609741211, "rewards/margins": 1.1723124980926514, "rewards/rejected": -10.998079299926758, "semantic_entropy": 0.0011104957666248083, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 17.212403265377716, "learning_rate": 8.399500128762693e-08, "logits/chosen": 0.7384323477745056, "logits/rejected": 0.809241771697998, "logps/chosen": -9.808893203735352, "logps/rejected": -11.134016036987305, "loss": 0.4052, "rewards/accuracies": 0.78125, "rewards/chosen": -9.808893203735352, "rewards/margins": 1.3251229524612427, "rewards/rejected": -11.134016036987305, "semantic_entropy": 0.001255923300050199, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 24.084031431969617, "learning_rate": 8.313303503222313e-08, "logits/chosen": 0.8113842010498047, "logits/rejected": 0.8640966415405273, "logps/chosen": -9.65959644317627, "logps/rejected": -10.88883113861084, "loss": 0.4217, "rewards/accuracies": 0.78125, "rewards/chosen": -9.65959644317627, "rewards/margins": 1.2292344570159912, "rewards/rejected": -10.88883113861084, "semantic_entropy": 0.0014709953684359789, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 23.42979933597834, "learning_rate": 8.227511322194164e-08, "logits/chosen": 0.8243430256843567, "logits/rejected": 0.8697830438613892, "logps/chosen": -9.677289962768555, "logps/rejected": -10.889700889587402, "loss": 0.4429, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -9.677289962768555, "rewards/margins": 1.212410807609558, "rewards/rejected": -10.889700889587402, "semantic_entropy": 0.001211336930282414, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 19.33238977643887, "learning_rate": 8.142124418034385e-08, "logits/chosen": 0.830100417137146, "logits/rejected": 0.8942376971244812, "logps/chosen": -9.691996574401855, "logps/rejected": -11.031505584716797, "loss": 0.4332, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.691996574401855, "rewards/margins": 1.3395094871520996, "rewards/rejected": -11.031505584716797, "semantic_entropy": 0.0013690624618902802, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 23.119082399990962, "learning_rate": 8.057143619167073e-08, "logits/chosen": 0.8294227719306946, "logits/rejected": 0.8717254400253296, "logps/chosen": -9.509129524230957, "logps/rejected": -10.85628890991211, "loss": 0.41, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.509129524230957, "rewards/margins": 1.3471596240997314, "rewards/rejected": -10.85628890991211, "semantic_entropy": 0.0018529357621446252, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 18.56557158174028, "learning_rate": 7.97256975007633e-08, "logits/chosen": 0.795819878578186, "logits/rejected": 0.88841712474823, "logps/chosen": -9.543716430664062, "logps/rejected": -10.928507804870605, "loss": 0.3959, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.543716430664062, "rewards/margins": 1.3847920894622803, "rewards/rejected": -10.928507804870605, "semantic_entropy": 0.0014773935545235872, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 26.78815904135646, "learning_rate": 7.888403631298186e-08, "logits/chosen": 0.7813885807991028, "logits/rejected": 0.8535317182540894, "logps/chosen": -9.660378456115723, "logps/rejected": -10.954367637634277, "loss": 0.4392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.660378456115723, "rewards/margins": 1.2939906120300293, "rewards/rejected": -10.954367637634277, "semantic_entropy": 0.0014350914862006903, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 20.289843532833586, "learning_rate": 7.804646079412719e-08, "logits/chosen": 0.8242961168289185, "logits/rejected": 0.9029603004455566, "logps/chosen": -9.712553977966309, "logps/rejected": -11.0477933883667, "loss": 0.4122, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.712553977966309, "rewards/margins": 1.335240125656128, "rewards/rejected": -11.0477933883667, "semantic_entropy": 0.0014915402280166745, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 21.47761754271104, "learning_rate": 7.72129790703604e-08, "logits/chosen": 0.787671685218811, "logits/rejected": 0.8374601602554321, "logps/chosen": -9.7283353805542, "logps/rejected": -10.967700004577637, "loss": 0.4222, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.7283353805542, "rewards/margins": 1.2393652200698853, "rewards/rejected": -10.967700004577637, "semantic_entropy": 0.001365487463772297, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 28.03895601653979, "learning_rate": 7.638359922812504e-08, "logits/chosen": 0.776307225227356, "logits/rejected": 0.8199490308761597, "logps/chosen": -9.557718276977539, "logps/rejected": -10.877527236938477, "loss": 0.4126, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.557718276977539, "rewards/margins": 1.3198084831237793, "rewards/rejected": -10.877527236938477, "semantic_entropy": 0.0016608207952231169, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 32.868150571212254, "learning_rate": 7.555832931406774e-08, "logits/chosen": 0.7730585336685181, "logits/rejected": 0.8562465906143188, "logps/chosen": -9.679393768310547, "logps/rejected": -11.042935371398926, "loss": 0.4244, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.679393768310547, "rewards/margins": 1.3635411262512207, "rewards/rejected": -11.042935371398926, "semantic_entropy": 0.0014725655782967806, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 18.781224798445212, "learning_rate": 7.47371773349611e-08, "logits/chosen": 0.8205526471138, "logits/rejected": 0.8565570712089539, "logps/chosen": -9.75233268737793, "logps/rejected": -11.291008949279785, "loss": 0.3533, "rewards/accuracies": 0.84375, "rewards/chosen": -9.75233268737793, "rewards/margins": 1.5386755466461182, "rewards/rejected": -11.291008949279785, "semantic_entropy": 0.0013560467632487416, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 28.052854143232477, "learning_rate": 7.392015125762496e-08, "logits/chosen": 0.7241109609603882, "logits/rejected": 0.8222753405570984, "logps/chosen": -9.689494132995605, "logps/rejected": -11.097482681274414, "loss": 0.3869, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.689494132995605, "rewards/margins": 1.4079889059066772, "rewards/rejected": -11.097482681274414, "semantic_entropy": 0.0011607788037508726, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 25.150937293751188, "learning_rate": 7.310725900885018e-08, "logits/chosen": 0.7780320048332214, "logits/rejected": 0.8369150161743164, "logps/chosen": -9.633565902709961, "logps/rejected": -10.885080337524414, "loss": 0.4725, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.633565902709961, "rewards/margins": 1.2515143156051636, "rewards/rejected": -10.885080337524414, "semantic_entropy": 0.001628419035114348, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 22.549685697406634, "learning_rate": 7.229850847532076e-08, "logits/chosen": 0.8130934834480286, "logits/rejected": 0.9138733744621277, "logps/chosen": -9.572754859924316, "logps/rejected": -11.118395805358887, "loss": 0.3436, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.572754859924316, "rewards/margins": 1.5456407070159912, "rewards/rejected": -11.118395805358887, "semantic_entropy": 0.0017241360619664192, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 22.715970370218447, "learning_rate": 7.149390750353779e-08, "logits/chosen": 0.8037542104721069, "logits/rejected": 0.8449680209159851, "logps/chosen": -9.80845832824707, "logps/rejected": -11.075779914855957, "loss": 0.4049, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.80845832824707, "rewards/margins": 1.2673219442367554, "rewards/rejected": -11.075779914855957, "semantic_entropy": 0.0013479054905474186, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 21.870424101570975, "learning_rate": 7.069346389974374e-08, "logits/chosen": 0.80865877866745, "logits/rejected": 0.8610594868659973, "logps/chosen": -9.720281600952148, "logps/rejected": -11.03128433227539, "loss": 0.3992, "rewards/accuracies": 0.875, "rewards/chosen": -9.720281600952148, "rewards/margins": 1.3110027313232422, "rewards/rejected": -11.03128433227539, "semantic_entropy": 0.0014346633106470108, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 25.565083159355556, "learning_rate": 6.989718542984563e-08, "logits/chosen": 0.7875592708587646, "logits/rejected": 0.8164467811584473, "logps/chosen": -9.870200157165527, "logps/rejected": -11.153292655944824, "loss": 0.4386, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.870200157165527, "rewards/margins": 1.2830924987792969, "rewards/rejected": -11.153292655944824, "semantic_entropy": 0.0011865177657455206, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 23.70159485340149, "learning_rate": 6.9105079819341e-08, "logits/chosen": 0.7858568429946899, "logits/rejected": 0.8397903442382812, "logps/chosen": -9.606379508972168, "logps/rejected": -11.173693656921387, "loss": 0.3656, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.606379508972168, "rewards/margins": 1.5673143863677979, "rewards/rejected": -11.173693656921387, "semantic_entropy": 0.0015011833747848868, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 20.488177720347142, "learning_rate": 6.831715475324163e-08, "logits/chosen": 0.7883397936820984, "logits/rejected": 0.8317297101020813, "logps/chosen": -9.790312767028809, "logps/rejected": -11.212163925170898, "loss": 0.4019, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.790312767028809, "rewards/margins": 1.4218522310256958, "rewards/rejected": -11.212163925170898, "semantic_entropy": 0.0015070982044562697, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 19.639448727845746, "learning_rate": 6.753341787600026e-08, "logits/chosen": 0.7966683506965637, "logits/rejected": 0.8559118509292603, "logps/chosen": -9.566210746765137, "logps/rejected": -11.087980270385742, "loss": 0.3618, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.566210746765137, "rewards/margins": 1.5217713117599487, "rewards/rejected": -11.087980270385742, "semantic_entropy": 0.0015061668818816543, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 23.858385047679327, "learning_rate": 6.67538767914353e-08, "logits/chosen": 0.791594922542572, "logits/rejected": 0.8555682301521301, "logps/chosen": -9.7157564163208, "logps/rejected": -10.862947463989258, "loss": 0.4575, "rewards/accuracies": 0.78125, "rewards/chosen": -9.7157564163208, "rewards/margins": 1.1471917629241943, "rewards/rejected": -10.862947463989258, "semantic_entropy": 0.0014351477148011327, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 23.875748432774547, "learning_rate": 6.597853906265793e-08, "logits/chosen": 0.8073896169662476, "logits/rejected": 0.850189208984375, "logps/chosen": -9.71528434753418, "logps/rejected": -11.275640487670898, "loss": 0.3913, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.71528434753418, "rewards/margins": 1.56035578250885, "rewards/rejected": -11.275640487670898, "semantic_entropy": 0.0013745089527219534, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 24.624000051473708, "learning_rate": 6.5207412211998e-08, "logits/chosen": 0.8650287389755249, "logits/rejected": 0.9158161282539368, "logps/chosen": -9.759730339050293, "logps/rejected": -11.222911834716797, "loss": 0.4193, "rewards/accuracies": 0.8125, "rewards/chosen": -9.759730339050293, "rewards/margins": 1.463181495666504, "rewards/rejected": -11.222911834716797, "semantic_entropy": 0.0013576913625001907, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 18.880924959816383, "learning_rate": 6.444050372093186e-08, "logits/chosen": 0.753667950630188, "logits/rejected": 0.8446556329727173, "logps/chosen": -9.776894569396973, "logps/rejected": -11.069523811340332, "loss": 0.4017, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.776894569396973, "rewards/margins": 1.292628526687622, "rewards/rejected": -11.069523811340332, "semantic_entropy": 0.0013450583210214972, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 26.78240030796786, "learning_rate": 6.367782103000873e-08, "logits/chosen": 0.8099533319473267, "logits/rejected": 0.8473021388053894, "logps/chosen": -9.640462875366211, "logps/rejected": -10.797101974487305, "loss": 0.4493, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.640462875366211, "rewards/margins": 1.1566379070281982, "rewards/rejected": -10.797101974487305, "semantic_entropy": 0.0015895968535915017, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 27.727951802633132, "learning_rate": 6.29193715387798e-08, "logits/chosen": 0.7754964828491211, "logits/rejected": 0.8192489743232727, "logps/chosen": -9.690164566040039, "logps/rejected": -11.208440780639648, "loss": 0.4116, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.690164566040039, "rewards/margins": 1.5182764530181885, "rewards/rejected": -11.208440780639648, "semantic_entropy": 0.0017889321316033602, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 28.149971668259315, "learning_rate": 6.216516260572502e-08, "logits/chosen": 0.7809394001960754, "logits/rejected": 0.8340757489204407, "logps/chosen": -9.840954780578613, "logps/rejected": -11.24518871307373, "loss": 0.3951, "rewards/accuracies": 0.8125, "rewards/chosen": -9.840954780578613, "rewards/margins": 1.4042353630065918, "rewards/rejected": -11.24518871307373, "semantic_entropy": 0.0015198871260508895, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 17.6357809314226, "learning_rate": 6.141520154818297e-08, "logits/chosen": 0.8027510643005371, "logits/rejected": 0.8235558271408081, "logps/chosen": -9.668852806091309, "logps/rejected": -10.926243782043457, "loss": 0.4394, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.668852806091309, "rewards/margins": 1.257389783859253, "rewards/rejected": -10.926243782043457, "semantic_entropy": 0.001651085214689374, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.8550450205802917, "eval_logits/rejected": 0.8955670595169067, "eval_logps/chosen": -9.817285537719727, "eval_logps/rejected": -10.942052841186523, "eval_loss": 0.5237716436386108, "eval_rewards/accuracies": 0.7255192995071411, "eval_rewards/chosen": -9.817285537719727, "eval_rewards/margins": 1.1247663497924805, "eval_rewards/rejected": -10.942052841186523, "eval_runtime": 35.1465, "eval_samples_per_second": 38.268, "eval_semantic_entropy": 0.0013976304326206446, "eval_steps_per_second": 9.588, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 25.642642407075105, "learning_rate": 6.066949564227897e-08, "logits/chosen": 0.7796936631202698, "logits/rejected": 0.817895233631134, "logps/chosen": -9.595781326293945, "logps/rejected": -10.862689018249512, "loss": 0.4646, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -9.595781326293945, "rewards/margins": 1.2669070959091187, "rewards/rejected": -10.862689018249512, "semantic_entropy": 0.0013227377785369754, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 20.93852604421033, "learning_rate": 5.992805212285523e-08, "logits/chosen": 0.777945339679718, "logits/rejected": 0.7996780872344971, "logps/chosen": -9.620034217834473, "logps/rejected": -11.079931259155273, "loss": 0.3832, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.620034217834473, "rewards/margins": 1.4598976373672485, "rewards/rejected": -11.079931259155273, "semantic_entropy": 0.0017585292225703597, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 24.39765980999833, "learning_rate": 5.9190878183399684e-08, "logits/chosen": 0.8418199419975281, "logits/rejected": 0.8500019311904907, "logps/chosen": -9.530296325683594, "logps/rejected": -10.991520881652832, "loss": 0.4437, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.530296325683594, "rewards/margins": 1.4612245559692383, "rewards/rejected": -10.991520881652832, "semantic_entropy": 0.0018125723581761122, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 31.63637936690902, "learning_rate": 5.845798097597748e-08, "logits/chosen": 0.8116466403007507, "logits/rejected": 0.8796448707580566, "logps/chosen": -9.765870094299316, "logps/rejected": -10.956972122192383, "loss": 0.4451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.765870094299316, "rewards/margins": 1.191102385520935, "rewards/rejected": -10.956972122192383, "semantic_entropy": 0.0012761508114635944, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 27.871272030425462, "learning_rate": 5.772936761116026e-08, "logits/chosen": 0.8403164148330688, "logits/rejected": 0.900818943977356, "logps/chosen": -9.660100936889648, "logps/rejected": -10.994760513305664, "loss": 0.4122, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.660100936889648, "rewards/margins": 1.3346589803695679, "rewards/rejected": -10.994760513305664, "semantic_entropy": 0.001608129939995706, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 28.248867607037017, "learning_rate": 5.700504515795829e-08, "logits/chosen": 0.8395519256591797, "logits/rejected": 0.8937468528747559, "logps/chosen": -9.703470230102539, "logps/rejected": -11.014467239379883, "loss": 0.413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.703470230102539, "rewards/margins": 1.3109973669052124, "rewards/rejected": -11.014467239379883, "semantic_entropy": 0.0014707682421430945, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 25.030373000459893, "learning_rate": 5.628502064375101e-08, "logits/chosen": 0.7156926393508911, "logits/rejected": 0.783743679523468, "logps/chosen": -9.621664047241211, "logps/rejected": -11.060919761657715, "loss": 0.351, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -9.621664047241211, "rewards/margins": 1.439256191253662, "rewards/rejected": -11.060919761657715, "semantic_entropy": 0.001427180483005941, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 26.228171272523518, "learning_rate": 5.55693010542197e-08, "logits/chosen": 0.7665778398513794, "logits/rejected": 0.8383312225341797, "logps/chosen": -9.543096542358398, "logps/rejected": -11.02253532409668, "loss": 0.3674, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.543096542358398, "rewards/margins": 1.4794379472732544, "rewards/rejected": -11.02253532409668, "semantic_entropy": 0.001682286150753498, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 28.09304993416799, "learning_rate": 5.485789333327856e-08, "logits/chosen": 0.7801726460456848, "logits/rejected": 0.8014837503433228, "logps/chosen": -9.684420585632324, "logps/rejected": -10.936495780944824, "loss": 0.4075, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.684420585632324, "rewards/margins": 1.2520757913589478, "rewards/rejected": -10.936495780944824, "semantic_entropy": 0.0015050426591187716, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 23.215944833323043, "learning_rate": 5.4150804383008675e-08, "logits/chosen": 0.7506409883499146, "logits/rejected": 0.8083317875862122, "logps/chosen": -9.687183380126953, "logps/rejected": -11.075380325317383, "loss": 0.4257, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.687183380126953, "rewards/margins": 1.3881968259811401, "rewards/rejected": -11.075380325317383, "semantic_entropy": 0.0012998328311368823, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 26.1051895667939, "learning_rate": 5.344804106359002e-08, "logits/chosen": 0.8464560508728027, "logits/rejected": 0.884229302406311, "logps/chosen": -9.554250717163086, "logps/rejected": -10.965102195739746, "loss": 0.3927, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.554250717163086, "rewards/margins": 1.410851240158081, "rewards/rejected": -10.965102195739746, "semantic_entropy": 0.0016194203635677695, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 29.62614133583001, "learning_rate": 5.274961019323559e-08, "logits/chosen": 0.7584127187728882, "logits/rejected": 0.7833465337753296, "logps/chosen": -9.556783676147461, "logps/rejected": -10.777464866638184, "loss": 0.4411, "rewards/accuracies": 0.8125, "rewards/chosen": -9.556783676147461, "rewards/margins": 1.2206814289093018, "rewards/rejected": -10.777464866638184, "semantic_entropy": 0.002015589503571391, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 19.381379819748208, "learning_rate": 5.205551854812451e-08, "logits/chosen": 0.8153206706047058, "logits/rejected": 0.8267370462417603, "logps/chosen": -9.816014289855957, "logps/rejected": -11.20833683013916, "loss": 0.4169, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -9.816014289855957, "rewards/margins": 1.3923231363296509, "rewards/rejected": -11.20833683013916, "semantic_entropy": 0.001474303426221013, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 16.778112141427197, "learning_rate": 5.1365772862337177e-08, "logits/chosen": 0.8045045137405396, "logits/rejected": 0.8898431658744812, "logps/chosen": -9.492764472961426, "logps/rejected": -11.15455436706543, "loss": 0.3243, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.492764472961426, "rewards/margins": 1.6617908477783203, "rewards/rejected": -11.15455436706543, "semantic_entropy": 0.001532680937089026, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 22.352970563490228, "learning_rate": 5.068037982778905e-08, "logits/chosen": 0.8180361986160278, "logits/rejected": 0.8790004849433899, "logps/chosen": -9.450529098510742, "logps/rejected": -10.990362167358398, "loss": 0.3905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.450529098510742, "rewards/margins": 1.539833426475525, "rewards/rejected": -10.990362167358398, "semantic_entropy": 0.0015029583591967821, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 20.68826893781038, "learning_rate": 4.999934609416656e-08, "logits/chosen": 0.9028242826461792, "logits/rejected": 0.9264825582504272, "logps/chosen": -9.678912162780762, "logps/rejected": -11.107706069946289, "loss": 0.3986, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.678912162780762, "rewards/margins": 1.4287939071655273, "rewards/rejected": -11.107706069946289, "semantic_entropy": 0.0014561197021976113, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 23.193032984362397, "learning_rate": 4.932267826886183e-08, "logits/chosen": 0.8201519846916199, "logits/rejected": 0.8877601623535156, "logps/chosen": -9.788244247436523, "logps/rejected": -11.236588478088379, "loss": 0.3943, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.788244247436523, "rewards/margins": 1.4483439922332764, "rewards/rejected": -11.236588478088379, "semantic_entropy": 0.0011878965888172388, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 22.557556114089028, "learning_rate": 4.8650382916909206e-08, "logits/chosen": 0.7988881468772888, "logits/rejected": 0.8320202827453613, "logps/chosen": -9.691407203674316, "logps/rejected": -11.068517684936523, "loss": 0.4254, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.691407203674316, "rewards/margins": 1.3771107196807861, "rewards/rejected": -11.068517684936523, "semantic_entropy": 0.001317240297794342, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 19.898200763361565, "learning_rate": 4.7982466560920976e-08, "logits/chosen": 0.7807987928390503, "logits/rejected": 0.8615278005599976, "logps/chosen": -9.802709579467773, "logps/rejected": -11.011950492858887, "loss": 0.4279, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.802709579467773, "rewards/margins": 1.2092421054840088, "rewards/rejected": -11.011950492858887, "semantic_entropy": 0.0015492916572839022, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 23.69089776610765, "learning_rate": 4.7318935681024685e-08, "logits/chosen": 0.7918484807014465, "logits/rejected": 0.8997529149055481, "logps/chosen": -9.777336120605469, "logps/rejected": -11.20526123046875, "loss": 0.374, "rewards/accuracies": 0.8125, "rewards/chosen": -9.777336120605469, "rewards/margins": 1.4279241561889648, "rewards/rejected": -11.20526123046875, "semantic_entropy": 0.0012134136632084846, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 20.406979801252664, "learning_rate": 4.6659796714799745e-08, "logits/chosen": 0.795091986656189, "logits/rejected": 0.8641031384468079, "logps/chosen": -9.73538589477539, "logps/rejected": -11.23878288269043, "loss": 0.3381, "rewards/accuracies": 0.875, "rewards/chosen": -9.73538589477539, "rewards/margins": 1.5033972263336182, "rewards/rejected": -11.23878288269043, "semantic_entropy": 0.0015479883877560496, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 19.21550189332208, "learning_rate": 4.60050560572155e-08, "logits/chosen": 0.7698862552642822, "logits/rejected": 0.8082722425460815, "logps/chosen": -9.636996269226074, "logps/rejected": -11.186447143554688, "loss": 0.3999, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.636996269226074, "rewards/margins": 1.5494511127471924, "rewards/rejected": -11.186447143554688, "semantic_entropy": 0.0014376682229340076, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 23.312400032128476, "learning_rate": 4.535472006056834e-08, "logits/chosen": 0.8084294199943542, "logits/rejected": 0.8658881187438965, "logps/chosen": -9.69934368133545, "logps/rejected": -10.961128234863281, "loss": 0.4383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.69934368133545, "rewards/margins": 1.261784553527832, "rewards/rejected": -10.961128234863281, "semantic_entropy": 0.0016576785128563643, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 22.36582731335697, "learning_rate": 4.470879503442132e-08, "logits/chosen": 0.8091555833816528, "logits/rejected": 0.8557069897651672, "logps/chosen": -9.803995132446289, "logps/rejected": -11.197736740112305, "loss": 0.3964, "rewards/accuracies": 0.8125, "rewards/chosen": -9.803995132446289, "rewards/margins": 1.3937435150146484, "rewards/rejected": -11.197736740112305, "semantic_entropy": 0.0012877520639449358, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 18.952340559608157, "learning_rate": 4.406728724554154e-08, "logits/chosen": 0.7486631870269775, "logits/rejected": 0.8488380312919617, "logps/chosen": -9.719161987304688, "logps/rejected": -11.249533653259277, "loss": 0.3694, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.719161987304688, "rewards/margins": 1.5303723812103271, "rewards/rejected": -11.249533653259277, "semantic_entropy": 0.001469378243200481, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 18.013629249072153, "learning_rate": 4.3430202917840664e-08, "logits/chosen": 0.8312109708786011, "logits/rejected": 0.9005948901176453, "logps/chosen": -9.830782890319824, "logps/rejected": -11.317447662353516, "loss": 0.3873, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.830782890319824, "rewards/margins": 1.4866645336151123, "rewards/rejected": -11.317447662353516, "semantic_entropy": 0.0013012022245675325, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 26.68369648236472, "learning_rate": 4.279754823231346e-08, "logits/chosen": 0.8236324191093445, "logits/rejected": 0.898714542388916, "logps/chosen": -9.688699722290039, "logps/rejected": -11.057371139526367, "loss": 0.4296, "rewards/accuracies": 0.8125, "rewards/chosen": -9.688699722290039, "rewards/margins": 1.368671178817749, "rewards/rejected": -11.057371139526367, "semantic_entropy": 0.0014394777826964855, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 19.214329700454428, "learning_rate": 4.216932932697859e-08, "logits/chosen": 0.7843598127365112, "logits/rejected": 0.8269468545913696, "logps/chosen": -9.618779182434082, "logps/rejected": -10.83703899383545, "loss": 0.4075, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.618779182434082, "rewards/margins": 1.2182590961456299, "rewards/rejected": -10.83703899383545, "semantic_entropy": 0.0018105891067534685, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 25.567099589733154, "learning_rate": 4.154555229681844e-08, "logits/chosen": 0.771405041217804, "logits/rejected": 0.867265522480011, "logps/chosen": -9.69874382019043, "logps/rejected": -11.158080101013184, "loss": 0.3747, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.69874382019043, "rewards/margins": 1.4593359231948853, "rewards/rejected": -11.158080101013184, "semantic_entropy": 0.0013225203147158027, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 21.387358904048636, "learning_rate": 4.092622319372069e-08, "logits/chosen": 0.8330507278442383, "logits/rejected": 0.8846317529678345, "logps/chosen": -9.71510124206543, "logps/rejected": -11.14268684387207, "loss": 0.3984, "rewards/accuracies": 0.8125, "rewards/chosen": -9.71510124206543, "rewards/margins": 1.4275856018066406, "rewards/rejected": -11.14268684387207, "semantic_entropy": 0.0013751887017861009, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 23.151467653095995, "learning_rate": 4.031134802641889e-08, "logits/chosen": 0.8044828176498413, "logits/rejected": 0.8498908877372742, "logps/chosen": -9.883355140686035, "logps/rejected": -11.208585739135742, "loss": 0.4105, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.883355140686035, "rewards/margins": 1.325231909751892, "rewards/rejected": -11.208585739135742, "semantic_entropy": 0.0015358685050159693, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 21.04274720664362, "learning_rate": 3.970093276043468e-08, "logits/chosen": 0.8240159749984741, "logits/rejected": 0.9003788828849792, "logps/chosen": -9.615509986877441, "logps/rejected": -11.031620025634766, "loss": 0.3856, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.615509986877441, "rewards/margins": 1.4161105155944824, "rewards/rejected": -11.031620025634766, "semantic_entropy": 0.0017490362515673041, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 39.33191380791592, "learning_rate": 3.9094983318019584e-08, "logits/chosen": 0.7968525290489197, "logits/rejected": 0.8308472633361816, "logps/chosen": -9.674779891967773, "logps/rejected": -11.145748138427734, "loss": 0.371, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.674779891967773, "rewards/margins": 1.4709681272506714, "rewards/rejected": -11.145748138427734, "semantic_entropy": 0.0014223111793398857, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 20.85099205117232, "learning_rate": 3.849350557809789e-08, "logits/chosen": 0.8421157002449036, "logits/rejected": 0.8929288983345032, "logps/chosen": -9.53125, "logps/rejected": -10.943530082702637, "loss": 0.3826, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.53125, "rewards/margins": 1.4122816324234009, "rewards/rejected": -10.943530082702637, "semantic_entropy": 0.0014043385162949562, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 22.939506081463986, "learning_rate": 3.789650537620903e-08, "logits/chosen": 0.8108006715774536, "logits/rejected": 0.8519940376281738, "logps/chosen": -9.818662643432617, "logps/rejected": -11.12476921081543, "loss": 0.3931, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.818662643432617, "rewards/margins": 1.3061046600341797, "rewards/rejected": -11.12476921081543, "semantic_entropy": 0.0010739094577729702, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 22.129521362188676, "learning_rate": 3.730398850445182e-08, "logits/chosen": 0.822609543800354, "logits/rejected": 0.8529809713363647, "logps/chosen": -9.925312995910645, "logps/rejected": -11.192630767822266, "loss": 0.4489, "rewards/accuracies": 0.78125, "rewards/chosen": -9.925312995910645, "rewards/margins": 1.2673180103302002, "rewards/rejected": -11.192630767822266, "semantic_entropy": 0.0011812245938926935, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 24.19733690749803, "learning_rate": 3.671596071142735e-08, "logits/chosen": 0.8324605226516724, "logits/rejected": 0.9051470756530762, "logps/chosen": -9.69133472442627, "logps/rejected": -11.151666641235352, "loss": 0.4529, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.69133472442627, "rewards/margins": 1.460331678390503, "rewards/rejected": -11.151666641235352, "semantic_entropy": 0.0018878221744671464, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 31.252423984045087, "learning_rate": 3.6132427702183996e-08, "logits/chosen": 0.8355720639228821, "logits/rejected": 0.8954976797103882, "logps/chosen": -9.62411880493164, "logps/rejected": -11.132128715515137, "loss": 0.361, "rewards/accuracies": 0.84375, "rewards/chosen": -9.62411880493164, "rewards/margins": 1.5080082416534424, "rewards/rejected": -11.132128715515137, "semantic_entropy": 0.0016464665532112122, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 21.68508018940414, "learning_rate": 3.555339513816147e-08, "logits/chosen": 0.7898616790771484, "logits/rejected": 0.8546016812324524, "logps/chosen": -9.857865333557129, "logps/rejected": -11.005608558654785, "loss": 0.4645, "rewards/accuracies": 0.78125, "rewards/chosen": -9.857865333557129, "rewards/margins": 1.147742748260498, "rewards/rejected": -11.005608558654785, "semantic_entropy": 0.0011803485685959458, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 21.328951358859364, "learning_rate": 3.497886863713639e-08, "logits/chosen": 0.8253191113471985, "logits/rejected": 0.8587236404418945, "logps/chosen": -9.829444885253906, "logps/rejected": -11.261645317077637, "loss": 0.4094, "rewards/accuracies": 0.8125, "rewards/chosen": -9.829444885253906, "rewards/margins": 1.4322013854980469, "rewards/rejected": -11.261645317077637, "semantic_entropy": 0.0012271823361515999, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 25.75553736028734, "learning_rate": 3.440885377316721e-08, "logits/chosen": 0.8507275581359863, "logits/rejected": 0.8877654075622559, "logps/chosen": -9.808802604675293, "logps/rejected": -10.979841232299805, "loss": 0.438, "rewards/accuracies": 0.78125, "rewards/chosen": -9.808802604675293, "rewards/margins": 1.1710389852523804, "rewards/rejected": -10.979841232299805, "semantic_entropy": 0.0013294884702190757, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 29.51695593361045, "learning_rate": 3.384335607654082e-08, "logits/chosen": 0.8268327713012695, "logits/rejected": 0.8833521008491516, "logps/chosen": -9.724630355834961, "logps/rejected": -11.137059211730957, "loss": 0.3859, "rewards/accuracies": 0.84375, "rewards/chosen": -9.724630355834961, "rewards/margins": 1.412428617477417, "rewards/rejected": -11.137059211730957, "semantic_entropy": 0.0016626717988401651, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 20.483800947742463, "learning_rate": 3.328238103371811e-08, "logits/chosen": 0.8099279403686523, "logits/rejected": 0.8540660738945007, "logps/chosen": -9.710822105407715, "logps/rejected": -11.191596984863281, "loss": 0.3804, "rewards/accuracies": 0.8125, "rewards/chosen": -9.710822105407715, "rewards/margins": 1.4807744026184082, "rewards/rejected": -11.191596984863281, "semantic_entropy": 0.0015045705949887633, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 26.229172740059667, "learning_rate": 3.272593408728169e-08, "logits/chosen": 0.8173542022705078, "logits/rejected": 0.8689044117927551, "logps/chosen": -9.657730102539062, "logps/rejected": -10.887810707092285, "loss": 0.4424, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.657730102539062, "rewards/margins": 1.230080008506775, "rewards/rejected": -10.887810707092285, "semantic_entropy": 0.0013948578853160143, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 19.40648296473702, "learning_rate": 3.217402063588204e-08, "logits/chosen": 0.7883289456367493, "logits/rejected": 0.8523383140563965, "logps/chosen": -9.800715446472168, "logps/rejected": -11.160974502563477, "loss": 0.4105, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.800715446472168, "rewards/margins": 1.360258936882019, "rewards/rejected": -11.160974502563477, "semantic_entropy": 0.00112335872836411, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 20.18654274201843, "learning_rate": 3.162664603418608e-08, "logits/chosen": 0.8560435175895691, "logits/rejected": 0.8896482586860657, "logps/chosen": -9.66085147857666, "logps/rejected": -11.121121406555176, "loss": 0.3676, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.66085147857666, "rewards/margins": 1.4602700471878052, "rewards/rejected": -11.121121406555176, "semantic_entropy": 0.0015098705189302564, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 27.638003566187923, "learning_rate": 3.1083815592824416e-08, "logits/chosen": 0.8065202832221985, "logits/rejected": 0.8954984545707703, "logps/chosen": -9.99959945678711, "logps/rejected": -11.314790725708008, "loss": 0.4151, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.99959945678711, "rewards/margins": 1.3151907920837402, "rewards/rejected": -11.314790725708008, "semantic_entropy": 0.0012680039508268237, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 22.574139063646903, "learning_rate": 3.054553457834053e-08, "logits/chosen": 0.8925463557243347, "logits/rejected": 0.9036859273910522, "logps/chosen": -9.90015983581543, "logps/rejected": -11.163106918334961, "loss": 0.4213, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.90015983581543, "rewards/margins": 1.262947678565979, "rewards/rejected": -11.163106918334961, "semantic_entropy": 0.0010416943114250898, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 23.894727539187592, "learning_rate": 3.0011808213139036e-08, "logits/chosen": 0.8361862301826477, "logits/rejected": 0.8620640635490417, "logps/chosen": -9.725111961364746, "logps/rejected": -11.048177719116211, "loss": 0.3983, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.725111961364746, "rewards/margins": 1.3230668306350708, "rewards/rejected": -11.048177719116211, "semantic_entropy": 0.001452545402571559, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 22.35978287732217, "learning_rate": 2.948264167543568e-08, "logits/chosen": 0.7902384996414185, "logits/rejected": 0.8329121470451355, "logps/chosen": -9.640339851379395, "logps/rejected": -10.89880657196045, "loss": 0.4069, "rewards/accuracies": 0.8125, "rewards/chosen": -9.640339851379395, "rewards/margins": 1.258466362953186, "rewards/rejected": -10.89880657196045, "semantic_entropy": 0.0015184081858024001, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 21.602439650044992, "learning_rate": 2.8958040099206216e-08, "logits/chosen": 0.7853751182556152, "logits/rejected": 0.8521712422370911, "logps/chosen": -9.592992782592773, "logps/rejected": -11.00520133972168, "loss": 0.3884, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.592992782592773, "rewards/margins": 1.4122079610824585, "rewards/rejected": -11.00520133972168, "semantic_entropy": 0.0015211288118734956, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 24.292700666304096, "learning_rate": 2.843800857413775e-08, "logits/chosen": 0.8210417628288269, "logits/rejected": 0.8545898199081421, "logps/chosen": -9.68048095703125, "logps/rejected": -10.930859565734863, "loss": 0.4632, "rewards/accuracies": 0.75, "rewards/chosen": -9.68048095703125, "rewards/margins": 1.2503786087036133, "rewards/rejected": -10.930859565734863, "semantic_entropy": 0.001456740777939558, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 23.853271664665353, "learning_rate": 2.7922552145578203e-08, "logits/chosen": 0.8457640409469604, "logits/rejected": 0.8957823514938354, "logps/chosen": -9.459394454956055, "logps/rejected": -10.84212589263916, "loss": 0.4069, "rewards/accuracies": 0.8125, "rewards/chosen": -9.459394454956055, "rewards/margins": 1.3827307224273682, "rewards/rejected": -10.84212589263916, "semantic_entropy": 0.0017677752766758204, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 27.366391781200583, "learning_rate": 2.7411675814488277e-08, "logits/chosen": 0.8697333335876465, "logits/rejected": 0.9225956201553345, "logps/chosen": -9.678709983825684, "logps/rejected": -10.979662895202637, "loss": 0.3868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.678709983825684, "rewards/margins": 1.3009527921676636, "rewards/rejected": -10.979662895202637, "semantic_entropy": 0.0014675845159217715, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 29.099080644827772, "learning_rate": 2.690538453739216e-08, "logits/chosen": 0.8708797693252563, "logits/rejected": 0.8982332348823547, "logps/chosen": -9.632844924926758, "logps/rejected": -10.764936447143555, "loss": 0.4623, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.632844924926758, "rewards/margins": 1.132093071937561, "rewards/rejected": -10.764936447143555, "semantic_entropy": 0.0013683564029633999, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 23.320063008009125, "learning_rate": 2.6403683226330298e-08, "logits/chosen": 0.8062912225723267, "logits/rejected": 0.8842616081237793, "logps/chosen": -9.81375503540039, "logps/rejected": -11.180780410766602, "loss": 0.3998, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.81375503540039, "rewards/margins": 1.3670246601104736, "rewards/rejected": -11.180780410766602, "semantic_entropy": 0.0012563010677695274, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 27.02131423176778, "learning_rate": 2.5906576748810804e-08, "logits/chosen": 0.8369554281234741, "logits/rejected": 0.8760555386543274, "logps/chosen": -9.69217300415039, "logps/rejected": -11.273096084594727, "loss": 0.3291, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.69217300415039, "rewards/margins": 1.580923318862915, "rewards/rejected": -11.273096084594727, "semantic_entropy": 0.0013889706460759044, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 26.902505403388286, "learning_rate": 2.5414069927763016e-08, "logits/chosen": 0.8435298800468445, "logits/rejected": 0.8922918438911438, "logps/chosen": -9.850217819213867, "logps/rejected": -11.199501037597656, "loss": 0.4054, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.850217819213867, "rewards/margins": 1.3492811918258667, "rewards/rejected": -11.199501037597656, "semantic_entropy": 0.0012866712640970945, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 23.139963064688857, "learning_rate": 2.4926167541490185e-08, "logits/chosen": 0.7457908987998962, "logits/rejected": 0.8107954859733582, "logps/chosen": -9.701014518737793, "logps/rejected": -11.19543743133545, "loss": 0.4072, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.701014518737793, "rewards/margins": 1.4944229125976562, "rewards/rejected": -11.19543743133545, "semantic_entropy": 0.001561012351885438, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 18.505427836623596, "learning_rate": 2.4442874323623574e-08, "logits/chosen": 0.8288080096244812, "logits/rejected": 0.852368950843811, "logps/chosen": -9.831713676452637, "logps/rejected": -11.238375663757324, "loss": 0.4368, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.831713676452637, "rewards/margins": 1.4066613912582397, "rewards/rejected": -11.238375663757324, "semantic_entropy": 0.0013758750865235925, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 24.20695474770458, "learning_rate": 2.396419496307589e-08, "logits/chosen": 0.7989641427993774, "logits/rejected": 0.8495559692382812, "logps/chosen": -9.894341468811035, "logps/rejected": -11.228631019592285, "loss": 0.3974, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.894341468811035, "rewards/margins": 1.3342888355255127, "rewards/rejected": -11.228631019592285, "semantic_entropy": 0.0011284537613391876, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 19.561878478981807, "learning_rate": 2.349013410399653e-08, "logits/chosen": 0.7845159769058228, "logits/rejected": 0.8294118046760559, "logps/chosen": -9.763102531433105, "logps/rejected": -11.050474166870117, "loss": 0.4596, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.763102531433105, "rewards/margins": 1.2873718738555908, "rewards/rejected": -11.050474166870117, "semantic_entropy": 0.0012428943300619721, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 26.484826469549404, "learning_rate": 2.3020696345725954e-08, "logits/chosen": 0.7876384258270264, "logits/rejected": 0.8591636419296265, "logps/chosen": -9.837101936340332, "logps/rejected": -11.317276954650879, "loss": 0.348, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.837101936340332, "rewards/margins": 1.4801758527755737, "rewards/rejected": -11.317276954650879, "semantic_entropy": 0.0012740811798721552, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 26.768124375085126, "learning_rate": 2.2555886242751398e-08, "logits/chosen": 0.8366681933403015, "logits/rejected": 0.8936346173286438, "logps/chosen": -9.770502090454102, "logps/rejected": -11.170347213745117, "loss": 0.3935, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.770502090454102, "rewards/margins": 1.3998456001281738, "rewards/rejected": -11.170347213745117, "semantic_entropy": 0.0012818884570151567, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 34.15098903260704, "learning_rate": 2.2095708304662453e-08, "logits/chosen": 0.7642577886581421, "logits/rejected": 0.8753819465637207, "logps/chosen": -9.696569442749023, "logps/rejected": -11.150983810424805, "loss": 0.3862, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.696569442749023, "rewards/margins": 1.454413652420044, "rewards/rejected": -11.150983810424805, "semantic_entropy": 0.0013747283956035972, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 28.8525710173051, "learning_rate": 2.16401669961076e-08, "logits/chosen": 0.7833540439605713, "logits/rejected": 0.855305552482605, "logps/chosen": -9.691442489624023, "logps/rejected": -11.090289115905762, "loss": 0.4098, "rewards/accuracies": 0.8125, "rewards/chosen": -9.691442489624023, "rewards/margins": 1.3988467454910278, "rewards/rejected": -11.090289115905762, "semantic_entropy": 0.001538719516247511, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 30.720388291606127, "learning_rate": 2.1189266736750532e-08, "logits/chosen": 0.8673465847969055, "logits/rejected": 0.9190985560417175, "logps/chosen": -9.79682445526123, "logps/rejected": -11.017160415649414, "loss": 0.4352, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.79682445526123, "rewards/margins": 1.2203348875045776, "rewards/rejected": -11.017160415649414, "semantic_entropy": 0.0019034147262573242, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 19.26413991827598, "learning_rate": 2.0743011901227623e-08, "logits/chosen": 0.8775045275688171, "logits/rejected": 0.948703944683075, "logps/chosen": -9.918733596801758, "logps/rejected": -11.272817611694336, "loss": 0.3995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.918733596801758, "rewards/margins": 1.3540844917297363, "rewards/rejected": -11.272817611694336, "semantic_entropy": 0.0010751333320513368, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 27.791159491160563, "learning_rate": 2.030140681910508e-08, "logits/chosen": 0.8398303985595703, "logits/rejected": 0.8916828036308289, "logps/chosen": -9.831799507141113, "logps/rejected": -11.197306632995605, "loss": 0.4334, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.831799507141113, "rewards/margins": 1.3655065298080444, "rewards/rejected": -11.197306632995605, "semantic_entropy": 0.0013978518545627594, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 26.235313395338775, "learning_rate": 1.986445577483753e-08, "logits/chosen": 0.8134158849716187, "logits/rejected": 0.8556219935417175, "logps/chosen": -9.731077194213867, "logps/rejected": -11.126041412353516, "loss": 0.4181, "rewards/accuracies": 0.78125, "rewards/chosen": -9.731077194213867, "rewards/margins": 1.394963026046753, "rewards/rejected": -11.126041412353516, "semantic_entropy": 0.00145871308632195, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 19.605206487487308, "learning_rate": 1.9432163007725765e-08, "logits/chosen": 0.7870944738388062, "logits/rejected": 0.8299382925033569, "logps/chosen": -9.642583847045898, "logps/rejected": -11.064803123474121, "loss": 0.4027, "rewards/accuracies": 0.8125, "rewards/chosen": -9.642583847045898, "rewards/margins": 1.4222198724746704, "rewards/rejected": -11.064803123474121, "semantic_entropy": 0.0014933927450329065, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 18.551143666002904, "learning_rate": 1.9004532711876297e-08, "logits/chosen": 0.7672029137611389, "logits/rejected": 0.8150334358215332, "logps/chosen": -9.65953254699707, "logps/rejected": -11.068132400512695, "loss": 0.3776, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.65953254699707, "rewards/margins": 1.4086004495620728, "rewards/rejected": -11.068132400512695, "semantic_entropy": 0.0015167773235589266, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 28.399641966852776, "learning_rate": 1.8581569036159928e-08, "logits/chosen": 0.8048108220100403, "logits/rejected": 0.8365411758422852, "logps/chosen": -9.653702735900879, "logps/rejected": -11.053384780883789, "loss": 0.4278, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.653702735900879, "rewards/margins": 1.3996822834014893, "rewards/rejected": -11.053384780883789, "semantic_entropy": 0.0016002919292077422, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 22.352042394265855, "learning_rate": 1.8163276084172285e-08, "logits/chosen": 0.8579298257827759, "logits/rejected": 0.9403360486030579, "logps/chosen": -10.103005409240723, "logps/rejected": -11.47459602355957, "loss": 0.3877, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -10.103005409240723, "rewards/margins": 1.3715909719467163, "rewards/rejected": -11.47459602355957, "semantic_entropy": 0.001051284489221871, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 25.499811878639434, "learning_rate": 1.7749657914193194e-08, "logits/chosen": 0.8347901105880737, "logits/rejected": 0.9103593826293945, "logps/chosen": -9.974831581115723, "logps/rejected": -11.45885944366455, "loss": 0.3708, "rewards/accuracies": 0.84375, "rewards/chosen": -9.974831581115723, "rewards/margins": 1.4840264320373535, "rewards/rejected": -11.45885944366455, "semantic_entropy": 0.0011410152073949575, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 28.6122272486666, "learning_rate": 1.7340718539148203e-08, "logits/chosen": 0.8332939147949219, "logits/rejected": 0.8270009756088257, "logps/chosen": -9.950929641723633, "logps/rejected": -11.199871063232422, "loss": 0.4188, "rewards/accuracies": 0.8125, "rewards/chosen": -9.950929641723633, "rewards/margins": 1.2489430904388428, "rewards/rejected": -11.199871063232422, "semantic_entropy": 0.001222481718286872, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 17.901053138739112, "learning_rate": 1.6936461926568724e-08, "logits/chosen": 0.8614856600761414, "logits/rejected": 0.8957780599594116, "logps/chosen": -9.616273880004883, "logps/rejected": -11.097272872924805, "loss": 0.3995, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.616273880004883, "rewards/margins": 1.4809997081756592, "rewards/rejected": -11.097272872924805, "semantic_entropy": 0.001978642772883177, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 23.833732221773882, "learning_rate": 1.6536891998554346e-08, "logits/chosen": 0.7540593147277832, "logits/rejected": 0.8192625045776367, "logps/chosen": -9.756368637084961, "logps/rejected": -11.14604663848877, "loss": 0.3821, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.756368637084961, "rewards/margins": 1.3896772861480713, "rewards/rejected": -11.14604663848877, "semantic_entropy": 0.0011760034831240773, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 22.006338516815813, "learning_rate": 1.6142012631734093e-08, "logits/chosen": 0.8480769991874695, "logits/rejected": 0.9198252558708191, "logps/chosen": -9.712282180786133, "logps/rejected": -11.163978576660156, "loss": 0.3873, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.712282180786133, "rewards/margins": 1.4516950845718384, "rewards/rejected": -11.163978576660156, "semantic_entropy": 0.0014456122880801558, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 29.759216354153438, "learning_rate": 1.575182765722949e-08, "logits/chosen": 0.7646334171295166, "logits/rejected": 0.8091537356376648, "logps/chosen": -9.873659133911133, "logps/rejected": -11.170156478881836, "loss": 0.4221, "rewards/accuracies": 0.8125, "rewards/chosen": -9.873659133911133, "rewards/margins": 1.2964979410171509, "rewards/rejected": -11.170156478881836, "semantic_entropy": 0.001071856007911265, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.8672059774398804, "eval_logits/rejected": 0.9047586917877197, "eval_logps/chosen": -9.958077430725098, "eval_logps/rejected": -11.086060523986816, "eval_loss": 0.5239496231079102, "eval_rewards/accuracies": 0.7247774600982666, "eval_rewards/chosen": -9.958077430725098, "eval_rewards/margins": 1.1279836893081665, "eval_rewards/rejected": -11.086060523986816, "eval_runtime": 35.0763, "eval_samples_per_second": 38.345, "eval_semantic_entropy": 0.0012691987212747335, "eval_steps_per_second": 9.608, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 19.396392953978733, "learning_rate": 1.536634086061672e-08, "logits/chosen": 0.8649656176567078, "logits/rejected": 0.8771345019340515, "logps/chosen": -9.787662506103516, "logps/rejected": -11.10372543334961, "loss": 0.4402, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.787662506103516, "rewards/margins": 1.316063642501831, "rewards/rejected": -11.10372543334961, "semantic_entropy": 0.001424965332262218, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 26.94949255337376, "learning_rate": 1.4985555981890495e-08, "logits/chosen": 0.8623428344726562, "logits/rejected": 0.8985008001327515, "logps/chosen": -9.891159057617188, "logps/rejected": -11.298178672790527, "loss": 0.4101, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.891159057617188, "rewards/margins": 1.4070203304290771, "rewards/rejected": -11.298178672790527, "semantic_entropy": 0.0011413523461669683, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 19.482056056108192, "learning_rate": 1.4609476715427226e-08, "logits/chosen": 0.8556788563728333, "logits/rejected": 0.8989516496658325, "logps/chosen": -9.591584205627441, "logps/rejected": -11.054668426513672, "loss": 0.3868, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.591584205627441, "rewards/margins": 1.4630842208862305, "rewards/rejected": -11.054668426513672, "semantic_entropy": 0.0015841536223888397, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 24.45269364730528, "learning_rate": 1.4238106709949792e-08, "logits/chosen": 0.7989322543144226, "logits/rejected": 0.8648680448532104, "logps/chosen": -9.795249938964844, "logps/rejected": -11.304253578186035, "loss": 0.3443, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.795249938964844, "rewards/margins": 1.5090038776397705, "rewards/rejected": -11.304253578186035, "semantic_entropy": 0.0012715930351987481, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 29.960130426736338, "learning_rate": 1.3871449568491511e-08, "logits/chosen": 0.7782562971115112, "logits/rejected": 0.8624800443649292, "logps/chosen": -9.840250968933105, "logps/rejected": -11.1703462600708, "loss": 0.4095, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.840250968933105, "rewards/margins": 1.3300951719284058, "rewards/rejected": -11.1703462600708, "semantic_entropy": 0.0011493575293570757, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 17.26742704065323, "learning_rate": 1.3509508848361606e-08, "logits/chosen": 0.7447667121887207, "logits/rejected": 0.7923721075057983, "logps/chosen": -9.699943542480469, "logps/rejected": -11.134923934936523, "loss": 0.3732, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.699943542480469, "rewards/margins": 1.4349799156188965, "rewards/rejected": -11.134923934936523, "semantic_entropy": 0.001493643270805478, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 18.96292537799569, "learning_rate": 1.3152288061110517e-08, "logits/chosen": 0.7624896764755249, "logits/rejected": 0.8235493898391724, "logps/chosen": -9.626928329467773, "logps/rejected": -11.030915260314941, "loss": 0.3852, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -9.626928329467773, "rewards/margins": 1.4039862155914307, "rewards/rejected": -11.030915260314941, "semantic_entropy": 0.0014617822598665953, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 21.629798181433536, "learning_rate": 1.2799790672495814e-08, "logits/chosen": 0.8226927518844604, "logits/rejected": 0.88921058177948, "logps/chosen": -9.711584091186523, "logps/rejected": -11.16446590423584, "loss": 0.3962, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.711584091186523, "rewards/margins": 1.4528809785842896, "rewards/rejected": -11.16446590423584, "semantic_entropy": 0.0014884325210005045, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 25.58854679978555, "learning_rate": 1.2452020102448835e-08, "logits/chosen": 0.844528317451477, "logits/rejected": 0.8750749826431274, "logps/chosen": -9.763383865356445, "logps/rejected": -11.062616348266602, "loss": 0.4045, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.763383865356445, "rewards/margins": 1.2992339134216309, "rewards/rejected": -11.062616348266602, "semantic_entropy": 0.001275677583180368, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 30.415412249344016, "learning_rate": 1.2108979725041103e-08, "logits/chosen": 0.7932205200195312, "logits/rejected": 0.8923565745353699, "logps/chosen": -9.760113716125488, "logps/rejected": -11.208516120910645, "loss": 0.4172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.760113716125488, "rewards/margins": 1.4484022855758667, "rewards/rejected": -11.208516120910645, "semantic_entropy": 0.0014725803630426526, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 22.204446838947426, "learning_rate": 1.1770672868451958e-08, "logits/chosen": 0.8339746594429016, "logits/rejected": 0.9121615290641785, "logps/chosen": -10.030183792114258, "logps/rejected": -11.423551559448242, "loss": 0.3705, "rewards/accuracies": 0.84375, "rewards/chosen": -10.030183792114258, "rewards/margins": 1.393368124961853, "rewards/rejected": -11.423551559448242, "semantic_entropy": 0.0011341646313667297, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 36.390773136441254, "learning_rate": 1.1437102814935872e-08, "logits/chosen": 0.8048036694526672, "logits/rejected": 0.8228060603141785, "logps/chosen": -9.797411918640137, "logps/rejected": -11.064781188964844, "loss": 0.4645, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.797411918640137, "rewards/margins": 1.267369270324707, "rewards/rejected": -11.064781188964844, "semantic_entropy": 0.0013338859425857663, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 19.439863848193177, "learning_rate": 1.1108272800791018e-08, "logits/chosen": 0.8065903782844543, "logits/rejected": 0.8303533792495728, "logps/chosen": -9.820572853088379, "logps/rejected": -11.200105667114258, "loss": 0.3785, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.820572853088379, "rewards/margins": 1.3795334100723267, "rewards/rejected": -11.200105667114258, "semantic_entropy": 0.0013253279030323029, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 24.995280474358317, "learning_rate": 1.078418601632769e-08, "logits/chosen": 0.8746574521064758, "logits/rejected": 0.8969374895095825, "logps/chosen": -9.832982063293457, "logps/rejected": -11.270672798156738, "loss": 0.3779, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.832982063293457, "rewards/margins": 1.43769109249115, "rewards/rejected": -11.270672798156738, "semantic_entropy": 0.0013589839218184352, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 15.437216876674157, "learning_rate": 1.0464845605837159e-08, "logits/chosen": 0.7981137633323669, "logits/rejected": 0.8469152450561523, "logps/chosen": -9.720634460449219, "logps/rejected": -11.106169700622559, "loss": 0.3561, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.720634460449219, "rewards/margins": 1.385535478591919, "rewards/rejected": -11.106169700622559, "semantic_entropy": 0.0014761090278625488, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 15.998655549802471, "learning_rate": 1.0150254667561642e-08, "logits/chosen": 0.7983990907669067, "logits/rejected": 0.8353071212768555, "logps/chosen": -10.041610717773438, "logps/rejected": -11.562406539916992, "loss": 0.377, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -10.041610717773438, "rewards/margins": 1.5207948684692383, "rewards/rejected": -11.562406539916992, "semantic_entropy": 0.0010572883766144514, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 27.424504358400647, "learning_rate": 9.840416253663719e-09, "logits/chosen": 0.7955508232116699, "logits/rejected": 0.8736175298690796, "logps/chosen": -9.822931289672852, "logps/rejected": -11.340441703796387, "loss": 0.3779, "rewards/accuracies": 0.84375, "rewards/chosen": -9.822931289672852, "rewards/margins": 1.5175096988677979, "rewards/rejected": -11.340441703796387, "semantic_entropy": 0.0011566228931769729, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 27.591040599780438, "learning_rate": 9.535333370197074e-09, "logits/chosen": 0.8231992721557617, "logits/rejected": 0.8760835528373718, "logps/chosen": -9.819466590881348, "logps/rejected": -11.222938537597656, "loss": 0.4014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.819466590881348, "rewards/margins": 1.4034711122512817, "rewards/rejected": -11.222938537597656, "semantic_entropy": 0.0014938964741304517, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 18.470072827747238, "learning_rate": 9.23500897707713e-09, "logits/chosen": 0.8005205988883972, "logits/rejected": 0.8584259748458862, "logps/chosen": -9.931344985961914, "logps/rejected": -11.320419311523438, "loss": 0.4159, "rewards/accuracies": 0.8125, "rewards/chosen": -9.931344985961914, "rewards/margins": 1.3890745639801025, "rewards/rejected": -11.320419311523438, "semantic_entropy": 0.0011897350195795298, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 23.14759681156242, "learning_rate": 8.939445988052574e-09, "logits/chosen": 0.7812570929527283, "logits/rejected": 0.8415404558181763, "logps/chosen": -9.733675003051758, "logps/rejected": -11.219903945922852, "loss": 0.3617, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.733675003051758, "rewards/margins": 1.4862289428710938, "rewards/rejected": -11.219903945922852, "semantic_entropy": 0.001550258370116353, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 28.37485360554531, "learning_rate": 8.648647270676656e-09, "logits/chosen": 0.8305708169937134, "logits/rejected": 0.8369789123535156, "logps/chosen": -9.811845779418945, "logps/rejected": -11.167532920837402, "loss": 0.4223, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.811845779418945, "rewards/margins": 1.3556877374649048, "rewards/rejected": -11.167532920837402, "semantic_entropy": 0.0015380210243165493, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 16.333026482611224, "learning_rate": 8.362615646279991e-09, "logits/chosen": 0.8135038614273071, "logits/rejected": 0.8544967770576477, "logps/chosen": -9.836585998535156, "logps/rejected": -11.551431655883789, "loss": 0.379, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.836585998535156, "rewards/margins": 1.7148460149765015, "rewards/rejected": -11.551431655883789, "semantic_entropy": 0.0012975989375263453, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 22.628935794259295, "learning_rate": 8.081353889942466e-09, "logits/chosen": 0.8904609680175781, "logits/rejected": 0.9412251710891724, "logps/chosen": -9.89416217803955, "logps/rejected": -11.147021293640137, "loss": 0.4024, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.89416217803955, "rewards/margins": 1.2528594732284546, "rewards/rejected": -11.147021293640137, "semantic_entropy": 0.001227770815603435, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 24.067990062164654, "learning_rate": 7.804864730467042e-09, "logits/chosen": 0.8739885091781616, "logits/rejected": 0.9077037572860718, "logps/chosen": -9.861922264099121, "logps/rejected": -11.29807186126709, "loss": 0.3558, "rewards/accuracies": 0.84375, "rewards/chosen": -9.861922264099121, "rewards/margins": 1.4361498355865479, "rewards/rejected": -11.29807186126709, "semantic_entropy": 0.0011800903594121337, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 20.07201861954392, "learning_rate": 7.533150850352665e-09, "logits/chosen": 0.8092811703681946, "logits/rejected": 0.8977264165878296, "logps/chosen": -9.86597728729248, "logps/rejected": -11.442625045776367, "loss": 0.3673, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.86597728729248, "rewards/margins": 1.5766479969024658, "rewards/rejected": -11.442625045776367, "semantic_entropy": 0.0013031138805672526, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 28.074967041760427, "learning_rate": 7.2662148857686175e-09, "logits/chosen": 0.8255325555801392, "logits/rejected": 0.8576027154922485, "logps/chosen": -9.927534103393555, "logps/rejected": -11.337235450744629, "loss": 0.4482, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.927534103393555, "rewards/margins": 1.4097009897232056, "rewards/rejected": -11.337235450744629, "semantic_entropy": 0.0012552501866593957, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 20.375284103879604, "learning_rate": 7.0040594265287635e-09, "logits/chosen": 0.820387065410614, "logits/rejected": 0.8405435681343079, "logps/chosen": -9.774995803833008, "logps/rejected": -10.992179870605469, "loss": 0.4593, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.774995803833008, "rewards/margins": 1.2171828746795654, "rewards/rejected": -10.992179870605469, "semantic_entropy": 0.001311628962866962, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 20.571805080727156, "learning_rate": 6.746687016066566e-09, "logits/chosen": 0.8561931848526001, "logits/rejected": 0.9219743609428406, "logps/chosen": -9.802877426147461, "logps/rejected": -11.207548141479492, "loss": 0.4, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.802877426147461, "rewards/margins": 1.404672384262085, "rewards/rejected": -11.207548141479492, "semantic_entropy": 0.0014292590785771608, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 22.031680288537032, "learning_rate": 6.494100151410276e-09, "logits/chosen": 0.7743942141532898, "logits/rejected": 0.8276403546333313, "logps/chosen": -9.865171432495117, "logps/rejected": -11.20588207244873, "loss": 0.3737, "rewards/accuracies": 0.84375, "rewards/chosen": -9.865171432495117, "rewards/margins": 1.3407100439071655, "rewards/rejected": -11.20588207244873, "semantic_entropy": 0.0012073902180418372, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 23.63055718220825, "learning_rate": 6.246301283158728e-09, "logits/chosen": 0.8746307492256165, "logits/rejected": 0.895931601524353, "logps/chosen": -9.761825561523438, "logps/rejected": -11.010538101196289, "loss": 0.473, "rewards/accuracies": 0.75, "rewards/chosen": -9.761825561523438, "rewards/margins": 1.2487126588821411, "rewards/rejected": -11.010538101196289, "semantic_entropy": 0.001414592145010829, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 20.296610881740666, "learning_rate": 6.0032928154576944e-09, "logits/chosen": 0.8312317132949829, "logits/rejected": 0.8839332461357117, "logps/chosen": -9.876100540161133, "logps/rejected": -11.113783836364746, "loss": 0.4206, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.876100540161133, "rewards/margins": 1.2376841306686401, "rewards/rejected": -11.113783836364746, "semantic_entropy": 0.0015838369727134705, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 30.87457108658666, "learning_rate": 5.76507710597629e-09, "logits/chosen": 0.8078063726425171, "logits/rejected": 0.8501046895980835, "logps/chosen": -9.82047176361084, "logps/rejected": -11.10120964050293, "loss": 0.4325, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.82047176361084, "rewards/margins": 1.2807366847991943, "rewards/rejected": -11.10120964050293, "semantic_entropy": 0.001436726888641715, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 20.020061107451614, "learning_rate": 5.531656465884438e-09, "logits/chosen": 0.7786573171615601, "logits/rejected": 0.8030532598495483, "logps/chosen": -9.810864448547363, "logps/rejected": -11.275449752807617, "loss": 0.402, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.810864448547363, "rewards/margins": 1.4645856618881226, "rewards/rejected": -11.275449752807617, "semantic_entropy": 0.0011968390317633748, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 27.208547849442688, "learning_rate": 5.303033159830217e-09, "logits/chosen": 0.8697658777236938, "logits/rejected": 0.8987666368484497, "logps/chosen": -9.942388534545898, "logps/rejected": -11.106074333190918, "loss": 0.4543, "rewards/accuracies": 0.78125, "rewards/chosen": -9.942388534545898, "rewards/margins": 1.1636863946914673, "rewards/rejected": -11.106074333190918, "semantic_entropy": 0.0013244937872514129, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 23.175199847112445, "learning_rate": 5.079209405917939e-09, "logits/chosen": 0.7859164476394653, "logits/rejected": 0.8312576413154602, "logps/chosen": -9.579522132873535, "logps/rejected": -11.244876861572266, "loss": 0.3603, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.579522132873535, "rewards/margins": 1.665353775024414, "rewards/rejected": -11.244876861572266, "semantic_entropy": 0.001594201079569757, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 19.400301117431837, "learning_rate": 4.860187375686664e-09, "logits/chosen": 0.789514422416687, "logits/rejected": 0.8606871366500854, "logps/chosen": -9.77333927154541, "logps/rejected": -11.256834983825684, "loss": 0.3748, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.77333927154541, "rewards/margins": 1.4834961891174316, "rewards/rejected": -11.256834983825684, "semantic_entropy": 0.001359016285277903, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 16.661476052195837, "learning_rate": 4.64596919408905e-09, "logits/chosen": 0.8640682101249695, "logits/rejected": 0.8913204073905945, "logps/chosen": -9.612969398498535, "logps/rejected": -11.022314071655273, "loss": 0.402, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.612969398498535, "rewards/margins": 1.4093445539474487, "rewards/rejected": -11.022314071655273, "semantic_entropy": 0.0015766730066388845, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 23.611323349348925, "learning_rate": 4.436556939470814e-09, "logits/chosen": 0.7981586456298828, "logits/rejected": 0.8657184839248657, "logps/chosen": -10.07356071472168, "logps/rejected": -11.186826705932617, "loss": 0.4737, "rewards/accuracies": 0.75, "rewards/chosen": -10.07356071472168, "rewards/margins": 1.1132649183273315, "rewards/rejected": -11.186826705932617, "semantic_entropy": 0.001212230185046792, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 24.769868059281617, "learning_rate": 4.23195264355064e-09, "logits/chosen": 0.688225269317627, "logits/rejected": 0.7675420641899109, "logps/chosen": -9.678533554077148, "logps/rejected": -11.024267196655273, "loss": 0.4225, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.678533554077148, "rewards/margins": 1.3457330465316772, "rewards/rejected": -11.024267196655273, "semantic_entropy": 0.001531310030259192, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 23.697079058959545, "learning_rate": 4.032158291400245e-09, "logits/chosen": 0.7804639339447021, "logits/rejected": 0.8653789758682251, "logps/chosen": -9.64900016784668, "logps/rejected": -11.365049362182617, "loss": 0.329, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -9.64900016784668, "rewards/margins": 1.7160485982894897, "rewards/rejected": -11.365049362182617, "semantic_entropy": 0.0016292607178911567, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 20.18703093788473, "learning_rate": 3.837175821425398e-09, "logits/chosen": 0.8109928369522095, "logits/rejected": 0.8580228686332703, "logps/chosen": -9.768526077270508, "logps/rejected": -11.133955001831055, "loss": 0.4179, "rewards/accuracies": 0.8125, "rewards/chosen": -9.768526077270508, "rewards/margins": 1.3654298782348633, "rewards/rejected": -11.133955001831055, "semantic_entropy": 0.0016449004178866744, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 14.944854500276783, "learning_rate": 3.6470071253467683e-09, "logits/chosen": 0.8247249722480774, "logits/rejected": 0.8406414985656738, "logps/chosen": -9.951112747192383, "logps/rejected": -11.39813232421875, "loss": 0.4228, "rewards/accuracies": 0.78125, "rewards/chosen": -9.951112747192383, "rewards/margins": 1.4470199346542358, "rewards/rejected": -11.39813232421875, "semantic_entropy": 0.001148298499174416, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 17.13225055698541, "learning_rate": 3.461654048181939e-09, "logits/chosen": 0.810439944267273, "logits/rejected": 0.904525637626648, "logps/chosen": -10.016260147094727, "logps/rejected": -11.255754470825195, "loss": 0.4262, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -10.016260147094727, "rewards/margins": 1.2394943237304688, "rewards/rejected": -11.255754470825195, "semantic_entropy": 0.0010481254430487752, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 21.19682630785255, "learning_rate": 3.281118388227255e-09, "logits/chosen": 0.8494071960449219, "logits/rejected": 0.8823550343513489, "logps/chosen": -9.834370613098145, "logps/rejected": -11.027946472167969, "loss": 0.4672, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -9.834370613098145, "rewards/margins": 1.193576693534851, "rewards/rejected": -11.027946472167969, "semantic_entropy": 0.001247903099283576, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 25.992370109808167, "learning_rate": 3.1054018970405048e-09, "logits/chosen": 0.8348292112350464, "logits/rejected": 0.8578389286994934, "logps/chosen": -9.816828727722168, "logps/rejected": -11.25603199005127, "loss": 0.4049, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.816828727722168, "rewards/margins": 1.4392026662826538, "rewards/rejected": -11.25603199005127, "semantic_entropy": 0.0012755084317177534, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 22.319154102335233, "learning_rate": 2.9345062794238207e-09, "logits/chosen": 0.8351479768753052, "logits/rejected": 0.9166293144226074, "logps/chosen": -9.844882011413574, "logps/rejected": -11.379692077636719, "loss": 0.35, "rewards/accuracies": 0.875, "rewards/chosen": -9.844882011413574, "rewards/margins": 1.5348093509674072, "rewards/rejected": -11.379692077636719, "semantic_entropy": 0.0016486002132296562, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 19.717507438885043, "learning_rate": 2.7684331934072492e-09, "logits/chosen": 0.7874764204025269, "logits/rejected": 0.8273738026618958, "logps/chosen": -9.670753479003906, "logps/rejected": -11.1673583984375, "loss": 0.3777, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.670753479003906, "rewards/margins": 1.4966033697128296, "rewards/rejected": -11.1673583984375, "semantic_entropy": 0.001804637722671032, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 20.47002458984704, "learning_rate": 2.6071842502326526e-09, "logits/chosen": 0.8280852437019348, "logits/rejected": 0.8752776980400085, "logps/chosen": -9.854182243347168, "logps/rejected": -11.047213554382324, "loss": 0.4253, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.854182243347168, "rewards/margins": 1.1930307149887085, "rewards/rejected": -11.047213554382324, "semantic_entropy": 0.0011610215296968818, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 27.24590899012989, "learning_rate": 2.450761014337888e-09, "logits/chosen": 0.8899133801460266, "logits/rejected": 0.9149841070175171, "logps/chosen": -9.686140060424805, "logps/rejected": -11.125383377075195, "loss": 0.4581, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.686140060424805, "rewards/margins": 1.4392426013946533, "rewards/rejected": -11.125383377075195, "semantic_entropy": 0.0013887417735531926, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 30.201566273141413, "learning_rate": 2.299165003341985e-09, "logits/chosen": 0.8797470331192017, "logits/rejected": 0.9165847897529602, "logps/chosen": -9.838947296142578, "logps/rejected": -11.165335655212402, "loss": 0.4477, "rewards/accuracies": 0.78125, "rewards/chosen": -9.838947296142578, "rewards/margins": 1.3263883590698242, "rewards/rejected": -11.165335655212402, "semantic_entropy": 0.0014228606596589088, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 21.66371324186062, "learning_rate": 2.1523976880299945e-09, "logits/chosen": 0.7495226263999939, "logits/rejected": 0.8447777032852173, "logps/chosen": -9.877795219421387, "logps/rejected": -10.990615844726562, "loss": 0.4624, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.877795219421387, "rewards/margins": 1.1128205060958862, "rewards/rejected": -10.990615844726562, "semantic_entropy": 0.001258770003914833, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 19.131164191986134, "learning_rate": 2.010460492339161e-09, "logits/chosen": 0.7976396083831787, "logits/rejected": 0.8642821311950684, "logps/chosen": -9.621539115905762, "logps/rejected": -11.064565658569336, "loss": 0.3859, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.621539115905762, "rewards/margins": 1.4430257081985474, "rewards/rejected": -11.064565658569336, "semantic_entropy": 0.0015615615993738174, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 16.998983409706852, "learning_rate": 1.8733547933446614e-09, "logits/chosen": 0.8069744110107422, "logits/rejected": 0.8920931816101074, "logps/chosen": -9.918710708618164, "logps/rejected": -11.107327461242676, "loss": 0.4295, "rewards/accuracies": 0.78125, "rewards/chosen": -9.918710708618164, "rewards/margins": 1.1886180639266968, "rewards/rejected": -11.107327461242676, "semantic_entropy": 0.001179686514660716, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 34.20627907884556, "learning_rate": 1.7410819212467231e-09, "logits/chosen": 0.8224443197250366, "logits/rejected": 0.8717595338821411, "logps/chosen": -9.855988502502441, "logps/rejected": -11.113961219787598, "loss": 0.4348, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.855988502502441, "rewards/margins": 1.2579724788665771, "rewards/rejected": -11.113961219787598, "semantic_entropy": 0.0013345398474484682, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 21.07119537810863, "learning_rate": 1.613643159357192e-09, "logits/chosen": 0.8732544183731079, "logits/rejected": 0.8522858619689941, "logps/chosen": -9.735010147094727, "logps/rejected": -10.999938011169434, "loss": 0.4045, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.735010147094727, "rewards/margins": 1.2649286985397339, "rewards/rejected": -10.999938011169434, "semantic_entropy": 0.0016018247697502375, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 22.6554353032534, "learning_rate": 1.4910397440875967e-09, "logits/chosen": 0.795121431350708, "logits/rejected": 0.8548393249511719, "logps/chosen": -9.826266288757324, "logps/rejected": -11.189390182495117, "loss": 0.4163, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.826266288757324, "rewards/margins": 1.363124132156372, "rewards/rejected": -11.189390182495117, "semantic_entropy": 0.0013476324966177344, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 25.149551686278766, "learning_rate": 1.3732728649368253e-09, "logits/chosen": 0.8276500701904297, "logits/rejected": 0.8823320269584656, "logps/chosen": -9.684768676757812, "logps/rejected": -10.858797073364258, "loss": 0.4437, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.684768676757812, "rewards/margins": 1.1740278005599976, "rewards/rejected": -10.858797073364258, "semantic_entropy": 0.0018609801772981882, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 25.88498942606627, "learning_rate": 1.260343664479524e-09, "logits/chosen": 0.7547510862350464, "logits/rejected": 0.7992917895317078, "logps/chosen": -9.716946601867676, "logps/rejected": -10.954937934875488, "loss": 0.4331, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.716946601867676, "rewards/margins": 1.2379915714263916, "rewards/rejected": -10.954937934875488, "semantic_entropy": 0.0013116684276610613, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 18.9946324561305, "learning_rate": 1.1522532383554384e-09, "logits/chosen": 0.8538810014724731, "logits/rejected": 0.9081370234489441, "logps/chosen": -9.737415313720703, "logps/rejected": -11.309637069702148, "loss": 0.3575, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.737415313720703, "rewards/margins": 1.5722216367721558, "rewards/rejected": -11.309637069702148, "semantic_entropy": 0.0014735187869518995, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 18.123811698473638, "learning_rate": 1.049002635258256e-09, "logits/chosen": 0.8666974902153015, "logits/rejected": 0.9071024656295776, "logps/chosen": -9.86131763458252, "logps/rejected": -11.140911102294922, "loss": 0.4182, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.86131763458252, "rewards/margins": 1.2795933485031128, "rewards/rejected": -11.140911102294922, "semantic_entropy": 0.0012588893296197057, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 28.867054705404758, "learning_rate": 9.505928569258358e-10, "logits/chosen": 0.828734278678894, "logits/rejected": 0.8480997085571289, "logps/chosen": -9.789422988891602, "logps/rejected": -11.044143676757812, "loss": 0.4458, "rewards/accuracies": 0.78125, "rewards/chosen": -9.789422988891602, "rewards/margins": 1.2547214031219482, "rewards/rejected": -11.044143676757812, "semantic_entropy": 0.0014515508664771914, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 20.361191522004823, "learning_rate": 8.57024858130273e-10, "logits/chosen": 0.8103793859481812, "logits/rejected": 0.8867511749267578, "logps/chosen": -9.793035507202148, "logps/rejected": -11.505876541137695, "loss": 0.3433, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.793035507202148, "rewards/margins": 1.7128407955169678, "rewards/rejected": -11.505876541137695, "semantic_entropy": 0.0011786060640588403, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 25.4932234461195, "learning_rate": 7.682995466686826e-10, "logits/chosen": 0.7820402383804321, "logits/rejected": 0.8293735384941101, "logps/chosen": -9.81843376159668, "logps/rejected": -11.206514358520508, "loss": 0.4035, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.81843376159668, "rewards/margins": 1.3880798816680908, "rewards/rejected": -11.206514358520508, "semantic_entropy": 0.001364008872769773, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 22.786471299719448, "learning_rate": 6.844177833543741e-10, "logits/chosen": 0.8798080682754517, "logits/rejected": 0.8902498483657837, "logps/chosen": -9.707275390625, "logps/rejected": -11.065264701843262, "loss": 0.3739, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.707275390625, "rewards/margins": 1.3579896688461304, "rewards/rejected": -11.065264701843262, "semantic_entropy": 0.0014018730726093054, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 24.344487883516077, "learning_rate": 6.053803820087467e-10, "logits/chosen": 0.8397024273872375, "logits/rejected": 0.9295798540115356, "logps/chosen": -9.954577445983887, "logps/rejected": -11.368110656738281, "loss": 0.4164, "rewards/accuracies": 0.8125, "rewards/chosen": -9.954577445983887, "rewards/margins": 1.4135328531265259, "rewards/rejected": -11.368110656738281, "semantic_entropy": 0.0010031659621745348, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 22.61990267466979, "learning_rate": 5.311881094528514e-10, "logits/chosen": 0.8053072094917297, "logits/rejected": 0.8756014108657837, "logps/chosen": -10.003788948059082, "logps/rejected": -11.16923713684082, "loss": 0.4491, "rewards/accuracies": 0.75, "rewards/chosen": -10.003788948059082, "rewards/margins": 1.1654479503631592, "rewards/rejected": -11.16923713684082, "semantic_entropy": 0.001166566857136786, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 25.594286670666698, "learning_rate": 4.6184168550050806e-10, "logits/chosen": 0.8106497526168823, "logits/rejected": 0.8661069869995117, "logps/chosen": -9.88862419128418, "logps/rejected": -11.237882614135742, "loss": 0.4061, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.88862419128418, "rewards/margins": 1.3492584228515625, "rewards/rejected": -11.237882614135742, "semantic_entropy": 0.0012052215170115232, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 22.658593939455915, "learning_rate": 3.973417829510328e-10, "logits/chosen": 0.7906460762023926, "logits/rejected": 0.8491800427436829, "logps/chosen": -9.941095352172852, "logps/rejected": -11.274066925048828, "loss": 0.4179, "rewards/accuracies": 0.8125, "rewards/chosen": -9.941095352172852, "rewards/margins": 1.332972526550293, "rewards/rejected": -11.274066925048828, "semantic_entropy": 0.0011057687224820256, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 22.045194562461553, "learning_rate": 3.3768902758274377e-10, "logits/chosen": 0.847141444683075, "logits/rejected": 0.885520339012146, "logps/chosen": -9.864678382873535, "logps/rejected": -11.155853271484375, "loss": 0.4141, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -9.864678382873535, "rewards/margins": 1.2911745309829712, "rewards/rejected": -11.155853271484375, "semantic_entropy": 0.0010961454827338457, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 16.922925357956778, "learning_rate": 2.8288399814691e-10, "logits/chosen": 0.8648706674575806, "logits/rejected": 0.9044130444526672, "logps/chosen": -9.688464164733887, "logps/rejected": -10.932621955871582, "loss": 0.4164, "rewards/accuracies": 0.8125, "rewards/chosen": -9.688464164733887, "rewards/margins": 1.2441574335098267, "rewards/rejected": -10.932621955871582, "semantic_entropy": 0.0013283784501254559, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 25.775353395913132, "learning_rate": 2.3292722636220066e-10, "logits/chosen": 0.7877558469772339, "logits/rejected": 0.8715565800666809, "logps/chosen": -9.736814498901367, "logps/rejected": -11.426549911499023, "loss": 0.3466, "rewards/accuracies": 0.84375, "rewards/chosen": -9.736814498901367, "rewards/margins": 1.6897351741790771, "rewards/rejected": -11.426549911499023, "semantic_entropy": 0.001374770887196064, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 24.401341154573725, "learning_rate": 1.8781919690946668e-10, "logits/chosen": 0.7978461980819702, "logits/rejected": 0.8458053469657898, "logps/chosen": -9.92179012298584, "logps/rejected": -11.083666801452637, "loss": 0.4448, "rewards/accuracies": 0.8125, "rewards/chosen": -9.92179012298584, "rewards/margins": 1.1618760824203491, "rewards/rejected": -11.083666801452637, "semantic_entropy": 0.001205159118399024, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 23.987542667982627, "learning_rate": 1.4756034742696711e-10, "logits/chosen": 0.8300431370735168, "logits/rejected": 0.9039738774299622, "logps/chosen": -9.867820739746094, "logps/rejected": -11.240914344787598, "loss": 0.4123, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.867820739746094, "rewards/margins": 1.3730926513671875, "rewards/rejected": -11.240914344787598, "semantic_entropy": 0.0011399075156077743, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 20.018421831607807, "learning_rate": 1.12151068506261e-10, "logits/chosen": 0.8630874752998352, "logits/rejected": 0.9077743291854858, "logps/chosen": -9.715357780456543, "logps/rejected": -11.356972694396973, "loss": 0.3585, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.715357780456543, "rewards/margins": 1.6416149139404297, "rewards/rejected": -11.356972694396973, "semantic_entropy": 0.0017618630081415176, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 19.995404454240774, "learning_rate": 8.159170368826629e-11, "logits/chosen": 0.8361412882804871, "logits/rejected": 0.8893247842788696, "logps/chosen": -9.477738380432129, "logps/rejected": -10.875368118286133, "loss": 0.4263, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.477738380432129, "rewards/margins": 1.3976287841796875, "rewards/rejected": -10.875368118286133, "semantic_entropy": 0.001689505996182561, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 25.119002068756046, "learning_rate": 5.588254946015114e-11, "logits/chosen": 0.8052582740783691, "logits/rejected": 0.8892769813537598, "logps/chosen": -9.728483200073242, "logps/rejected": -11.229433059692383, "loss": 0.3915, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -9.728483200073242, "rewards/margins": 1.5009489059448242, "rewards/rejected": -11.229433059692383, "semantic_entropy": 0.0017851864686235785, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 17.38170767201798, "learning_rate": 3.502385525216978e-11, "logits/chosen": 0.7621601819992065, "logits/rejected": 0.8389317393302917, "logps/chosen": -9.730030059814453, "logps/rejected": -11.127847671508789, "loss": 0.3831, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.730030059814453, "rewards/margins": 1.3978168964385986, "rewards/rejected": -11.127847671508789, "semantic_entropy": 0.0015055348630994558, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 18.851089955324714, "learning_rate": 1.901582343555308e-11, "logits/chosen": 0.8362342119216919, "logits/rejected": 0.894599437713623, "logps/chosen": -9.942944526672363, "logps/rejected": -11.178738594055176, "loss": 0.4489, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.942944526672363, "rewards/margins": 1.2357933521270752, "rewards/rejected": -11.178738594055176, "semantic_entropy": 0.001227195025421679, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 33.481853049291296, "learning_rate": 7.858609320232634e-12, "logits/chosen": 0.8312174677848816, "logits/rejected": 0.907203197479248, "logps/chosen": -9.739818572998047, "logps/rejected": -11.097038269042969, "loss": 0.435, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.739818572998047, "rewards/margins": 1.3572200536727905, "rewards/rejected": -11.097038269042969, "semantic_entropy": 0.0013708441983908415, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 32.77365513835394, "learning_rate": 1.5523211535639624e-12, "logits/chosen": 0.8438766598701477, "logits/rejected": 0.876266360282898, "logps/chosen": -9.731843948364258, "logps/rejected": -11.251733779907227, "loss": 0.4023, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.731843948364258, "rewards/margins": 1.5198904275894165, "rewards/rejected": -11.251733779907227, "semantic_entropy": 0.001664994633756578, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.9163224101066589, "eval_logits/rejected": 0.9576993584632874, "eval_logps/chosen": -9.975739479064941, "eval_logps/rejected": -11.105352401733398, "eval_loss": 0.52450031042099, "eval_rewards/accuracies": 0.7240356206893921, "eval_rewards/chosen": -9.975739479064941, "eval_rewards/margins": 1.1296132802963257, "eval_rewards/rejected": -11.105352401733398, "eval_runtime": 35.057, "eval_samples_per_second": 38.366, "eval_semantic_entropy": 0.0012647128896787763, "eval_steps_per_second": 9.613, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.5450739759491819, "train_runtime": 29046.9509, "train_samples_per_second": 6.175, "train_steps_per_second": 0.193 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }