{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9957215091404122, "eval_steps": 64, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002074419810709192, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5625, "grad_norm": 30.445291710986226, "learning_rate": 0.0, "logits/chosen": 1.3143655061721802, "logits/rejected": 1.334812045097351, "logps/accuracies": 0.4375, "logps/chosen": -329.3199157714844, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -329.3199157714844, "logps/ref_rejected": -308.284912109375, "logps/rejected": -308.284912109375, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/grad_term": 0.05000000447034836, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004148839621418384, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.4375, "grad_norm": 31.539643174909184, "learning_rate": 1.5151715240963886e-07, "logits/chosen": 1.136220932006836, "logits/rejected": 1.1561778783798218, "logps/accuracies": 0.5625, "logps/chosen": -280.4060363769531, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -280.13531494140625, "logps/ref_rejected": -287.2406005859375, "logps/rejected": -287.34637451171875, "loss": 0.9925, "rewards/accuracies": 0.375, "rewards/chosen": -0.027070851996541023, "rewards/grad_term": 0.05042332783341408, "rewards/margins": -0.01649157702922821, "rewards/rejected": -0.010579276829957962, "step": 2 }, { "epoch": 0.006223259432127577, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.6875, "grad_norm": 35.669754591330424, "learning_rate": 2.401490047853298e-07, "logits/chosen": 1.6139692068099976, "logits/rejected": 1.5537246465682983, "logps/accuracies": 0.3125, "logps/chosen": -279.83502197265625, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -279.268310546875, "logps/ref_rejected": -258.850341796875, "logps/rejected": -259.1755065917969, "loss": 0.9854, "rewards/accuracies": 0.3125, "rewards/chosen": -0.056671928614377975, "rewards/grad_term": 0.050606753677129745, "rewards/margins": -0.024156270548701286, "rewards/rejected": -0.03251565620303154, "step": 3 }, { "epoch": 0.008297679242836769, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5625, "grad_norm": 35.57252658535935, "learning_rate": 3.030343048192777e-07, "logits/chosen": 1.7025470733642578, "logits/rejected": 1.6247684955596924, "logps/accuracies": 0.4375, "logps/chosen": -321.3578796386719, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -321.1121826171875, "logps/ref_rejected": -316.1632995605469, "logps/rejected": -316.3089599609375, "loss": 0.9885, "rewards/accuracies": 0.5625, "rewards/chosen": -0.024570418521761894, "rewards/grad_term": 0.050244007259607315, "rewards/margins": -0.010007334873080254, "rewards/rejected": -0.014563081786036491, "step": 4 }, { "epoch": 0.010372099053545962, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.375, "grad_norm": 40.58061807061268, "learning_rate": 3.5181193303727093e-07, "logits/chosen": 1.4226707220077515, "logits/rejected": 1.505796194076538, "logps/accuracies": 0.625, "logps/chosen": -245.84014892578125, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -245.4801483154297, "logps/ref_rejected": -251.10232543945312, "logps/rejected": -251.73736572265625, "loss": 0.9812, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03600040823221207, "rewards/grad_term": 0.049336254596710205, "rewards/margins": 0.02750583179295063, "rewards/rejected": -0.06350623816251755, "step": 5 }, { "epoch": 0.012446518864255154, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.3125, "grad_norm": 27.99149170411343, "learning_rate": 3.9166615719496866e-07, "logits/chosen": 1.3876930475234985, "logits/rejected": 1.4264953136444092, "logps/accuracies": 0.6875, "logps/chosen": -291.3964538574219, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -291.29791259765625, "logps/ref_rejected": -307.63433837890625, "logps/rejected": -309.51678466796875, "loss": 0.9803, "rewards/accuracies": 0.625, "rewards/chosen": -0.009854471310973167, "rewards/grad_term": 0.04629334807395935, "rewards/margins": 0.17839080095291138, "rewards/rejected": -0.1882452815771103, "step": 6 }, { "epoch": 0.014520938674964345, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.625, "grad_norm": 35.07512813691069, "learning_rate": 4.253624235933518e-07, "logits/chosen": 1.3303760290145874, "logits/rejected": 1.41872239112854, "logps/accuracies": 0.375, "logps/chosen": -256.7828369140625, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -256.71258544921875, "logps/ref_rejected": -252.4455108642578, "logps/rejected": -255.78366088867188, "loss": 0.9451, "rewards/accuracies": 0.8125, "rewards/chosen": -0.007027318701148033, "rewards/grad_term": 0.04277125000953674, "rewards/margins": 0.3267865777015686, "rewards/rejected": -0.3338139057159424, "step": 7 }, { "epoch": 0.016595358485673537, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.75, "grad_norm": 53.4690693475631, "learning_rate": 4.545514572289166e-07, "logits/chosen": 1.4601320028305054, "logits/rejected": 1.5025103092193604, "logps/accuracies": 0.25, "logps/chosen": -328.91387939453125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -328.650634765625, "logps/ref_rejected": -320.340576171875, "logps/rejected": -322.7126159667969, "loss": 0.9334, "rewards/accuracies": 0.5, "rewards/chosen": -0.026328086853027344, "rewards/grad_term": 0.04583045467734337, "rewards/margins": 0.2108786404132843, "rewards/rejected": -0.23720674216747284, "step": 8 }, { "epoch": 0.01866977829638273, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.375, "grad_norm": 50.206274648941296, "learning_rate": 4.802980095706596e-07, "logits/chosen": 1.516817331314087, "logits/rejected": 1.5115327835083008, "logps/accuracies": 0.625, "logps/chosen": -271.8262634277344, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -272.4455261230469, "logps/ref_rejected": -264.29541015625, "logps/rejected": -272.98419189453125, "loss": 0.9138, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0619237944483757, "rewards/grad_term": 0.037300530821084976, "rewards/margins": 0.930805504322052, "rewards/rejected": -0.8688817620277405, "step": 9 }, { "epoch": 0.020744198107091924, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.4375, "grad_norm": 68.87775211865134, "learning_rate": 5.033290854469099e-07, "logits/chosen": 1.1356251239776611, "logits/rejected": 1.1563141345977783, "logps/accuracies": 0.5625, "logps/chosen": -303.23114013671875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -303.0604553222656, "logps/ref_rejected": -303.90673828125, "logps/rejected": -306.86865234375, "loss": 0.9189, "rewards/accuracies": 0.6875, "rewards/chosen": -0.017069505527615547, "rewards/grad_term": 0.04398500546813011, "rewards/margins": 0.27912360429763794, "rewards/rejected": -0.29619312286376953, "step": 10 }, { "epoch": 0.022818617917801116, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.3125, "grad_norm": 57.71463843312805, "learning_rate": 5.241632278117911e-07, "logits/chosen": 1.4097551107406616, "logits/rejected": 1.5242267847061157, "logps/accuracies": 0.6875, "logps/chosen": -328.52459716796875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -328.664306640625, "logps/ref_rejected": -354.8746032714844, "logps/rejected": -370.66998291015625, "loss": 0.874, "rewards/accuracies": 0.9375, "rewards/chosen": 0.013973474502563477, "rewards/grad_term": 0.029512763023376465, "rewards/margins": 1.5935115814208984, "rewards/rejected": -1.579538106918335, "step": 11 }, { "epoch": 0.024893037728510307, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.3125, "grad_norm": 19.34405027942017, "learning_rate": 5.431833096046075e-07, "logits/chosen": 1.2825186252593994, "logits/rejected": 1.4041016101837158, "logps/accuracies": 0.6875, "logps/chosen": -339.60736083984375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -332.350341796875, "logps/ref_rejected": -342.4404602050781, "logps/rejected": -393.659423828125, "loss": 0.7647, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7256991863250732, "rewards/grad_term": 0.022119037806987762, "rewards/margins": 4.3961944580078125, "rewards/rejected": -5.121893405914307, "step": 12 }, { "epoch": 0.0269674575392195, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.375, "grad_norm": 18.068273844723095, "learning_rate": 5.606800887562651e-07, "logits/chosen": 1.3912986516952515, "logits/rejected": 1.4100464582443237, "logps/accuracies": 0.625, "logps/chosen": -360.9014587402344, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -349.53863525390625, "logps/ref_rejected": -341.5389709472656, "logps/rejected": -384.8885498046875, "loss": 0.7998, "rewards/accuracies": 0.875, "rewards/chosen": -1.136283040046692, "rewards/grad_term": 0.016487201675772667, "rewards/margins": 3.198676347732544, "rewards/rejected": -4.334959506988525, "step": 13 }, { "epoch": 0.02904187734992869, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.4375, "grad_norm": 27.645843878628447, "learning_rate": 5.768795760029907e-07, "logits/chosen": 1.4637022018432617, "logits/rejected": 1.4576082229614258, "logps/accuracies": 0.5625, "logps/chosen": -319.2772216796875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -297.5706787109375, "logps/ref_rejected": -267.2304382324219, "logps/rejected": -324.9382629394531, "loss": 0.7958, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1706531047821045, "rewards/grad_term": 0.016141919419169426, "rewards/margins": 3.600131034851074, "rewards/rejected": -5.770784378051758, "step": 14 }, { "epoch": 0.031116297160637883, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 24.62720014572735, "learning_rate": 5.919609378226007e-07, "logits/chosen": 1.4202252626419067, "logits/rejected": 1.5137040615081787, "logps/accuracies": 0.8125, "logps/chosen": -354.2460632324219, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -320.8343200683594, "logps/ref_rejected": -324.75347900390625, "logps/rejected": -385.1374816894531, "loss": 0.8253, "rewards/accuracies": 0.875, "rewards/chosen": -3.341172933578491, "rewards/grad_term": 0.024179620668292046, "rewards/margins": 2.697230815887451, "rewards/rejected": -6.0384039878845215, "step": 15 }, { "epoch": 0.033190716971347074, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.25, "grad_norm": 24.048376194470723, "learning_rate": 6.060686096385554e-07, "logits/chosen": 1.3625872135162354, "logits/rejected": 1.5674644708633423, "logps/accuracies": 0.75, "logps/chosen": -321.19952392578125, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -300.4372253417969, "logps/ref_rejected": -299.97760009765625, "logps/rejected": -369.1448974609375, "loss": 0.7917, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0762338638305664, "rewards/grad_term": 0.01863047480583191, "rewards/margins": 4.8404951095581055, "rewards/rejected": -6.916728973388672, "step": 16 }, { "epoch": 0.03526513678205627, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 45.28780856236718, "learning_rate": 6.193207302864632e-07, "logits/chosen": 1.3500301837921143, "logits/rejected": 1.3641669750213623, "logps/accuracies": 0.75, "logps/chosen": -257.4446105957031, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -241.6280517578125, "logps/ref_rejected": -237.22329711914062, "logps/rejected": -294.7928466796875, "loss": 0.7765, "rewards/accuracies": 0.875, "rewards/chosen": -1.5816560983657837, "rewards/grad_term": 0.019907817244529724, "rewards/margins": 4.175297737121582, "rewards/rejected": -5.756953239440918, "step": 17 }, { "epoch": 0.03733955659276546, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 33.38191088083146, "learning_rate": 6.318151619802984e-07, "logits/chosen": 1.1816256046295166, "logits/rejected": 1.2595932483673096, "logps/accuracies": 0.75, "logps/chosen": -317.13775634765625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -302.21380615234375, "logps/ref_rejected": -337.5311279296875, "logps/rejected": -381.22796630859375, "loss": 0.87, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4923958778381348, "rewards/grad_term": 0.024127114564180374, "rewards/margins": 2.877284288406372, "rewards/rejected": -4.369679927825928, "step": 18 }, { "epoch": 0.03941397640347465, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 40.949935957102895, "learning_rate": 6.436338804795301e-07, "logits/chosen": 1.4050649404525757, "logits/rejected": 1.4994277954101562, "logps/accuracies": 0.875, "logps/chosen": -292.9769287109375, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -263.2729187011719, "logps/ref_rejected": -291.3985290527344, "logps/rejected": -359.13031005859375, "loss": 0.8089, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9704017639160156, "rewards/grad_term": 0.02432025596499443, "rewards/margins": 3.802779197692871, "rewards/rejected": -6.773180961608887, "step": 19 }, { "epoch": 0.04148839621418385, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 30.4272452916532, "learning_rate": 6.548462378565487e-07, "logits/chosen": 1.5738377571105957, "logits/rejected": 1.5692741870880127, "logps/accuracies": 0.8125, "logps/chosen": -281.9122619628906, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -266.5782775878906, "logps/ref_rejected": -265.3857421875, "logps/rejected": -338.1681213378906, "loss": 0.8287, "rewards/accuracies": 0.875, "rewards/chosen": -1.5333983898162842, "rewards/grad_term": 0.01711239479482174, "rewards/margins": 5.744836807250977, "rewards/rejected": -7.278235912322998, "step": 20 }, { "epoch": 0.043562816024893036, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.375, "grad_norm": 21.481613213914592, "learning_rate": 6.655114283786817e-07, "logits/chosen": 1.495798945426941, "logits/rejected": 1.575231909751892, "logps/accuracies": 0.5625, "logps/chosen": -329.2530517578125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -307.7621154785156, "logps/ref_rejected": -305.4177551269531, "logps/rejected": -353.7918395996094, "loss": 0.7972, "rewards/accuracies": 0.75, "rewards/chosen": -2.1490931510925293, "rewards/grad_term": 0.026765087619423866, "rewards/margins": 2.6883203983306885, "rewards/rejected": -4.8374128341674805, "step": 21 }, { "epoch": 0.04563723583560223, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 43.410701214740534, "learning_rate": 6.7568038022143e-07, "logits/chosen": 1.3690290451049805, "logits/rejected": 1.4588748216629028, "logps/accuracies": 0.75, "logps/chosen": -303.6971740722656, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -295.21514892578125, "logps/ref_rejected": -289.0858459472656, "logps/rejected": -341.2356262207031, "loss": 0.756, "rewards/accuracies": 0.875, "rewards/chosen": -0.8482051491737366, "rewards/grad_term": 0.01651611179113388, "rewards/margins": 4.366776466369629, "rewards/rejected": -5.214981555938721, "step": 22 }, { "epoch": 0.04771165564631142, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.375, "grad_norm": 40.534862406872996, "learning_rate": 6.853972263303346e-07, "logits/chosen": 1.460001826286316, "logits/rejected": 1.4567062854766846, "logps/accuracies": 0.625, "logps/chosen": -357.6313171386719, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -350.6695251464844, "logps/ref_rejected": -331.9695129394531, "logps/rejected": -388.808349609375, "loss": 0.7408, "rewards/accuracies": 0.875, "rewards/chosen": -0.6961804628372192, "rewards/grad_term": 0.018778638914227486, "rewards/margins": 4.987700939178467, "rewards/rejected": -5.683881759643555, "step": 23 }, { "epoch": 0.049786075457020615, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 25.219410545582974, "learning_rate": 6.947004620142464e-07, "logits/chosen": 1.494457483291626, "logits/rejected": 1.583849310874939, "logps/accuracies": 0.8125, "logps/chosen": -303.1878662109375, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -287.615234375, "logps/ref_rejected": -305.63421630859375, "logps/rejected": -363.3654479980469, "loss": 0.741, "rewards/accuracies": 0.875, "rewards/chosen": -1.5572607517242432, "rewards/grad_term": 0.01411459967494011, "rewards/margins": 4.215861797332764, "rewards/rejected": -5.773122787475586, "step": 24 }, { "epoch": 0.05186049526772981, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 21.34830172085934, "learning_rate": 7.036238660745419e-07, "logits/chosen": 1.2966912984848022, "logits/rejected": 1.3436973094940186, "logps/accuracies": 0.8125, "logps/chosen": -318.70855712890625, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -309.7275390625, "logps/ref_rejected": -321.34222412109375, "logps/rejected": -374.23724365234375, "loss": 0.7473, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8981032371520996, "rewards/grad_term": 0.030619274824857712, "rewards/margins": 4.391395092010498, "rewards/rejected": -5.289497375488281, "step": 25 }, { "epoch": 0.053934915078439, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 24.70213159620223, "learning_rate": 7.121972411659039e-07, "logits/chosen": 1.527209997177124, "logits/rejected": 1.5188902616500854, "logps/accuracies": 0.75, "logps/chosen": -311.9977722167969, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -312.7909240722656, "logps/ref_rejected": -319.8678283691406, "logps/rejected": -361.7353515625, "loss": 0.7263, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07931968569755554, "rewards/grad_term": 0.0128245297819376, "rewards/margins": 4.26607084274292, "rewards/rejected": -4.186751365661621, "step": 26 }, { "epoch": 0.056009334889148193, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.375, "grad_norm": 21.657435932051584, "learning_rate": 7.204470143559894e-07, "logits/chosen": 1.0803958177566528, "logits/rejected": 1.182570219039917, "logps/accuracies": 0.625, "logps/chosen": -305.4505310058594, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -292.47894287109375, "logps/ref_rejected": -291.8714904785156, "logps/rejected": -341.31640625, "loss": 0.708, "rewards/accuracies": 0.8125, "rewards/chosen": -1.29715895652771, "rewards/grad_term": 0.02386583387851715, "rewards/margins": 3.6473331451416016, "rewards/rejected": -4.944491863250732, "step": 27 }, { "epoch": 0.05808375469985738, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.375, "grad_norm": 43.751484729145034, "learning_rate": 7.283967284126295e-07, "logits/chosen": 1.5686805248260498, "logits/rejected": 1.6045427322387695, "logps/accuracies": 0.5625, "logps/chosen": -274.0627136230469, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -275.16888427734375, "logps/ref_rejected": -261.067138671875, "logps/rejected": -293.029052734375, "loss": 0.7257, "rewards/accuracies": 0.875, "rewards/chosen": 0.11061999201774597, "rewards/grad_term": 0.01599438488483429, "rewards/margins": 3.306811571121216, "rewards/rejected": -3.1961915493011475, "step": 28 }, { "epoch": 0.06015817451056658, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0625, "flips/incorrect->incorrect": 0.3125, "grad_norm": 31.4830012847814, "learning_rate": 7.360674468418735e-07, "logits/chosen": 1.3757277727127075, "logits/rejected": 1.405045747756958, "logps/accuracies": 0.6875, "logps/chosen": -325.8724365234375, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -329.0152893066406, "logps/ref_rejected": -316.9151611328125, "logps/rejected": -348.4771423339844, "loss": 0.7116, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3142804205417633, "rewards/grad_term": 0.01970498077571392, "rewards/margins": 3.47047758102417, "rewards/rejected": -3.1561975479125977, "step": 29 }, { "epoch": 0.062232594321275765, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.0625, "grad_norm": 43.903525037448524, "learning_rate": 7.434780902322396e-07, "logits/chosen": 1.2318979501724243, "logits/rejected": 1.2543270587921143, "logps/accuracies": 0.9375, "logps/chosen": -299.3183288574219, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -293.3425598144531, "logps/ref_rejected": -326.46270751953125, "logps/rejected": -357.6190185546875, "loss": 0.714, "rewards/accuracies": 0.875, "rewards/chosen": -0.5975769758224487, "rewards/grad_term": 0.024707140401005745, "rewards/margins": 2.5180513858795166, "rewards/rejected": -3.115628242492676, "step": 30 }, { "epoch": 0.06430701413198496, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 34.75148622057771, "learning_rate": 7.506457174281587e-07, "logits/chosen": 1.2120341062545776, "logits/rejected": 1.220780849456787, "logps/accuracies": 0.8125, "logps/chosen": -327.4204406738281, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -317.1983337402344, "logps/ref_rejected": -326.3907165527344, "logps/rejected": -362.292724609375, "loss": 0.7512, "rewards/accuracies": 0.75, "rewards/chosen": -1.0222113132476807, "rewards/grad_term": 0.028988810256123543, "rewards/margins": 2.5679914951324463, "rewards/rejected": -3.590202808380127, "step": 31 }, { "epoch": 0.06638143394269415, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 24.42268750796852, "learning_rate": 7.575857620481944e-07, "logits/chosen": 1.2759184837341309, "logits/rejected": 1.330209732055664, "logps/accuracies": 0.75, "logps/chosen": -352.6253967285156, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -354.62640380859375, "logps/ref_rejected": -366.90618896484375, "logps/rejected": -401.5226135253906, "loss": 0.6974, "rewards/accuracies": 0.8125, "rewards/chosen": 0.20009994506835938, "rewards/grad_term": 0.021008620038628578, "rewards/margins": 3.6617395877838135, "rewards/rejected": -3.461639404296875, "step": 32 }, { "epoch": 0.06845585375340335, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 24.880719455281493, "learning_rate": 7.643122325971209e-07, "logits/chosen": 1.103371262550354, "logits/rejected": 1.1177877187728882, "logps/accuracies": 0.875, "logps/chosen": -307.4759826660156, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -311.5048828125, "logps/ref_rejected": -314.81597900390625, "logps/rejected": -364.4283447265625, "loss": 0.7078, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4028913974761963, "rewards/grad_term": 0.007870044559240341, "rewards/margins": 5.364123821258545, "rewards/rejected": -4.961232662200928, "step": 33 }, { "epoch": 0.07053027356411254, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 19.643933011479398, "learning_rate": 7.708378826961021e-07, "logits/chosen": 1.102717399597168, "logits/rejected": 1.2488610744476318, "logps/accuracies": 0.75, "logps/chosen": -334.2718200683594, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -332.568603515625, "logps/ref_rejected": -440.8373718261719, "logps/rejected": -474.8569030761719, "loss": 0.6622, "rewards/accuracies": 0.875, "rewards/chosen": -0.170322984457016, "rewards/grad_term": 0.02305561862885952, "rewards/margins": 3.231626510620117, "rewards/rejected": -3.401949882507324, "step": 34 }, { "epoch": 0.07260469337482173, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.375, "grad_norm": 22.64669553529777, "learning_rate": 7.771743566306228e-07, "logits/chosen": 1.0699303150177002, "logits/rejected": 1.0104891061782837, "logps/accuracies": 0.5625, "logps/chosen": -356.24755859375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -349.8689880371094, "logps/ref_rejected": -338.50445556640625, "logps/rejected": -369.7318115234375, "loss": 0.7305, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6378520131111145, "rewards/grad_term": 0.030929066240787506, "rewards/margins": 2.4848828315734863, "rewards/rejected": -3.1227352619171143, "step": 35 }, { "epoch": 0.07467911318553092, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 27.940317128258794, "learning_rate": 7.833323143899373e-07, "logits/chosen": 0.8235185146331787, "logits/rejected": 0.8534129858016968, "logps/accuracies": 0.75, "logps/chosen": -311.9441833496094, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -311.42791748046875, "logps/ref_rejected": -301.2667541503906, "logps/rejected": -360.82537841796875, "loss": 0.6602, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05162731558084488, "rewards/grad_term": 0.01031492929905653, "rewards/margins": 5.904232025146484, "rewards/rejected": -5.955859661102295, "step": 36 }, { "epoch": 0.07675353299624012, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 59.21163950114599, "learning_rate": 7.893215395709077e-07, "logits/chosen": 0.6561946868896484, "logits/rejected": 0.7136399745941162, "logps/accuracies": 0.875, "logps/chosen": -280.95098876953125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -277.3229064941406, "logps/ref_rejected": -273.8558349609375, "logps/rejected": -336.818115234375, "loss": 0.6433, "rewards/accuracies": 0.9375, "rewards/chosen": -0.36280834674835205, "rewards/grad_term": 0.008878666907548904, "rewards/margins": 5.933422088623047, "rewards/rejected": -6.296230316162109, "step": 37 }, { "epoch": 0.0788279528069493, "flips/correct->correct": 0.125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.5, "grad_norm": 49.21026179278685, "learning_rate": 7.951510328891689e-07, "logits/chosen": 1.0223195552825928, "logits/rejected": 0.9707791209220886, "logps/accuracies": 0.5, "logps/chosen": -251.95631408691406, "logps/ref_accuracies": 0.125, "logps/ref_chosen": -241.30072021484375, "logps/ref_rejected": -225.43099975585938, "logps/rejected": -282.074951171875, "loss": 0.6902, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0655571222305298, "rewards/grad_term": 0.019987476989626884, "rewards/margins": 4.59883975982666, "rewards/rejected": -5.664397239685059, "step": 38 }, { "epoch": 0.0809023726176585, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 27.79357547798861, "learning_rate": 8.008290935415948e-07, "logits/chosen": 0.7353692650794983, "logits/rejected": 0.8016875386238098, "logps/accuracies": 0.8125, "logps/chosen": -287.4935607910156, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -283.6936950683594, "logps/ref_rejected": -283.30487060546875, "logps/rejected": -337.9640197753906, "loss": 0.6843, "rewards/accuracies": 0.875, "rewards/chosen": -0.3799862861633301, "rewards/grad_term": 0.01074125524610281, "rewards/margins": 5.085926055908203, "rewards/rejected": -5.465912818908691, "step": 39 }, { "epoch": 0.0829767924283677, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 52.06939047981281, "learning_rate": 8.063633902661875e-07, "logits/chosen": 0.9137625694274902, "logits/rejected": 0.9001289010047913, "logps/accuracies": 0.8125, "logps/chosen": -308.0149841308594, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -298.7401428222656, "logps/ref_rejected": -289.8734436035156, "logps/rejected": -362.60211181640625, "loss": 0.6861, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9274865984916687, "rewards/grad_term": 0.015340002253651619, "rewards/margins": 6.3453826904296875, "rewards/rejected": -7.272869110107422, "step": 40 }, { "epoch": 0.08505121223907688, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 30.550457494843638, "learning_rate": 8.117610236262845e-07, "logits/chosen": 0.7508188486099243, "logits/rejected": 0.8092616200447083, "logps/accuracies": 0.8125, "logps/chosen": -344.6716613769531, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -332.699462890625, "logps/ref_rejected": -344.08209228515625, "logps/rejected": -386.7120056152344, "loss": 0.7017, "rewards/accuracies": 0.875, "rewards/chosen": -1.1972177028656006, "rewards/grad_term": 0.02788725309073925, "rewards/margins": 3.065772771835327, "rewards/rejected": -4.262990474700928, "step": 41 }, { "epoch": 0.08712563204978607, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 25.64415652049263, "learning_rate": 8.170285807883206e-07, "logits/chosen": 0.6477910876274109, "logits/rejected": 0.8107466697692871, "logps/accuracies": 0.8125, "logps/chosen": -261.5460205078125, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -256.37158203125, "logps/ref_rejected": -280.9100646972656, "logps/rejected": -316.6261901855469, "loss": 0.6765, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5174452066421509, "rewards/grad_term": 0.02346464805305004, "rewards/margins": 3.054164409637451, "rewards/rejected": -3.5716099739074707, "step": 42 }, { "epoch": 0.08920005186049526, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.3125, "grad_norm": 35.35935314371997, "learning_rate": 8.221721838532495e-07, "logits/chosen": 0.6840221285820007, "logits/rejected": 0.6609375476837158, "logps/accuracies": 0.6875, "logps/chosen": -291.4024353027344, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -296.0429992675781, "logps/ref_rejected": -282.0372009277344, "logps/rejected": -319.9362487792969, "loss": 0.5996, "rewards/accuracies": 1.0, "rewards/chosen": 0.4640587866306305, "rewards/grad_term": 0.011877249926328659, "rewards/margins": 4.253963470458984, "rewards/rejected": -3.7899045944213867, "step": 43 }, { "epoch": 0.09127447167120446, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.4375, "grad_norm": 28.87938199876403, "learning_rate": 8.271975326310688e-07, "logits/chosen": 0.8031829595565796, "logits/rejected": 0.7779420614242554, "logps/accuracies": 0.5625, "logps/chosen": -308.4298095703125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -299.1800537109375, "logps/ref_rejected": -301.2965087890625, "logps/rejected": -333.80767822265625, "loss": 0.65, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9249745607376099, "rewards/grad_term": 0.028043199330568314, "rewards/margins": 2.326139450073242, "rewards/rejected": -3.2511138916015625, "step": 44 }, { "epoch": 0.09334889148191365, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 27.45541909883828, "learning_rate": 8.321099426079305e-07, "logits/chosen": 0.6578277349472046, "logits/rejected": 0.7826619148254395, "logps/accuracies": 0.875, "logps/chosen": -284.1718444824219, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -278.0632019042969, "logps/ref_rejected": -310.51953125, "logps/rejected": -362.188720703125, "loss": 0.6724, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6108630895614624, "rewards/grad_term": 0.015516946092247963, "rewards/margins": 4.556060791015625, "rewards/rejected": -5.1669230461120605, "step": 45 }, { "epoch": 0.09542331129262284, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 59.28118284951873, "learning_rate": 8.369143787399735e-07, "logits/chosen": 0.9397487044334412, "logits/rejected": 0.9460306167602539, "logps/accuracies": 0.75, "logps/chosen": -250.52760314941406, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -249.87916564941406, "logps/ref_rejected": -253.21328735351562, "logps/rejected": -280.88421630859375, "loss": 0.7086, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0648447573184967, "rewards/grad_term": 0.024975696578621864, "rewards/margins": 2.702247142791748, "rewards/rejected": -2.7670915126800537, "step": 46 }, { "epoch": 0.09749773110333204, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.3125, "grad_norm": 27.055341319651774, "learning_rate": 8.416154856125216e-07, "logits/chosen": 0.8418172597885132, "logits/rejected": 0.8614631295204163, "logps/accuracies": 0.6875, "logps/chosen": -289.743408203125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -288.4799499511719, "logps/ref_rejected": -295.8186340332031, "logps/rejected": -333.06170654296875, "loss": 0.648, "rewards/accuracies": 0.875, "rewards/chosen": -0.12634362280368805, "rewards/grad_term": 0.015468433499336243, "rewards/margins": 3.597963571548462, "rewards/rejected": -3.724307060241699, "step": 47 }, { "epoch": 0.09957215091404123, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 29.320128400390026, "learning_rate": 8.462176144238853e-07, "logits/chosen": 1.0445611476898193, "logits/rejected": 1.080256700515747, "logps/accuracies": 0.875, "logps/chosen": -277.6840515136719, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -284.16644287109375, "logps/ref_rejected": -313.25799560546875, "logps/rejected": -363.92913818359375, "loss": 0.6148, "rewards/accuracies": 0.875, "rewards/chosen": 0.648241400718689, "rewards/grad_term": 0.01066309679299593, "rewards/margins": 5.71535062789917, "rewards/rejected": -5.067109107971191, "step": 48 }, { "epoch": 0.10164657072475042, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.3125, "grad_norm": 30.36633688848658, "learning_rate": 8.507248471867036e-07, "logits/chosen": 1.0277738571166992, "logits/rejected": 0.9966739416122437, "logps/accuracies": 0.625, "logps/chosen": -354.4734191894531, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -353.09027099609375, "logps/ref_rejected": -352.4345397949219, "logps/rejected": -385.0770263671875, "loss": 0.6356, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13831399381160736, "rewards/grad_term": 0.021235931664705276, "rewards/margins": 3.1259407997131348, "rewards/rejected": -3.264254570007324, "step": 49 }, { "epoch": 0.10372099053545962, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 34.234438693372084, "learning_rate": 8.551410184841808e-07, "logits/chosen": 0.8633083701133728, "logits/rejected": 0.860028088092804, "logps/accuracies": 0.6875, "logps/chosen": -252.79647827148438, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -252.6006317138672, "logps/ref_rejected": -258.76251220703125, "logps/rejected": -295.47882080078125, "loss": 0.65, "rewards/accuracies": 1.0, "rewards/chosen": -0.01958458498120308, "rewards/grad_term": 0.01410377025604248, "rewards/margins": 3.6520471572875977, "rewards/rejected": -3.6716315746307373, "step": 50 }, { "epoch": 0.10579541034616881, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.375, "grad_norm": 29.660260139951582, "learning_rate": 8.59469735071793e-07, "logits/chosen": 0.38166430592536926, "logits/rejected": 0.4328911304473877, "logps/accuracies": 0.625, "logps/chosen": -296.5737609863281, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -290.32611083984375, "logps/ref_rejected": -293.0495910644531, "logps/rejected": -349.3868408203125, "loss": 0.6285, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6247656941413879, "rewards/grad_term": 0.017583010718226433, "rewards/margins": 5.008961200714111, "rewards/rejected": -5.633727073669434, "step": 51 }, { "epoch": 0.107869830156878, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 25.949283947333324, "learning_rate": 8.637143935755428e-07, "logits/chosen": 0.727641224861145, "logits/rejected": 0.7505197525024414, "logps/accuracies": 0.625, "logps/chosen": -288.1712646484375, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -278.9561767578125, "logps/ref_rejected": -265.6705322265625, "logps/rejected": -308.8449401855469, "loss": 0.6146, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9215071201324463, "rewards/grad_term": 0.022838197648525238, "rewards/margins": 3.395932912826538, "rewards/rejected": -4.317440032958984, "step": 52 }, { "epoch": 0.10994424996758718, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 24.134309233597588, "learning_rate": 8.678781965043402e-07, "logits/chosen": 0.7036612033843994, "logits/rejected": 0.6557080745697021, "logps/accuracies": 0.875, "logps/chosen": -358.0240478515625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -348.71356201171875, "logps/ref_rejected": -355.6617431640625, "logps/rejected": -404.3010559082031, "loss": 0.6916, "rewards/accuracies": 0.75, "rewards/chosen": -0.9310531616210938, "rewards/grad_term": 0.02438879944384098, "rewards/margins": 3.93287992477417, "rewards/rejected": -4.863933086395264, "step": 53 }, { "epoch": 0.11201866977829639, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.375, "grad_norm": 53.64626463678004, "learning_rate": 8.719641667656282e-07, "logits/chosen": 0.6714786887168884, "logits/rejected": 0.5823845863342285, "logps/accuracies": 0.625, "logps/chosen": -376.20220947265625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -369.01507568359375, "logps/ref_rejected": -328.4320373535156, "logps/rejected": -383.90655517578125, "loss": 0.6963, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7187104821205139, "rewards/grad_term": 0.011453664861619473, "rewards/margins": 4.828742980957031, "rewards/rejected": -5.5474534034729, "step": 54 }, { "epoch": 0.11409308958900558, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 70.929778345069, "learning_rate": 8.759751608490621e-07, "logits/chosen": 0.44098129868507385, "logits/rejected": 0.5110803842544556, "logps/accuracies": 0.8125, "logps/chosen": -307.63323974609375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -303.2783203125, "logps/ref_rejected": -305.9503173828125, "logps/rejected": -365.0198669433594, "loss": 0.6622, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4354937672615051, "rewards/grad_term": 0.016174456104636192, "rewards/margins": 5.471461296081543, "rewards/rejected": -5.906955242156982, "step": 55 }, { "epoch": 0.11616750939971476, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 26.619296281410804, "learning_rate": 8.799138808222686e-07, "logits/chosen": 0.7330751419067383, "logits/rejected": 0.9024415016174316, "logps/accuracies": 0.75, "logps/chosen": -235.89239501953125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -231.52980041503906, "logps/ref_rejected": -262.5042724609375, "logps/rejected": -304.2793273925781, "loss": 0.6587, "rewards/accuracies": 0.875, "rewards/chosen": -0.43625733256340027, "rewards/grad_term": 0.020225245505571365, "rewards/margins": 3.7412445545196533, "rewards/rejected": -4.177502155303955, "step": 56 }, { "epoch": 0.11824192921042397, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 27.602722271896948, "learning_rate": 8.837828852648599e-07, "logits/chosen": 0.5326017737388611, "logits/rejected": 0.6437039971351624, "logps/accuracies": 0.8125, "logps/chosen": -301.2654113769531, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -306.17083740234375, "logps/ref_rejected": -299.8456115722656, "logps/rejected": -362.2225036621094, "loss": 0.6253, "rewards/accuracies": 0.875, "rewards/chosen": 0.4905431568622589, "rewards/grad_term": 0.014911260455846786, "rewards/margins": 6.728227615356445, "rewards/rejected": -6.2376837730407715, "step": 57 }, { "epoch": 0.12031634902113315, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.375, "grad_norm": 28.670654744865576, "learning_rate": 8.875845992515123e-07, "logits/chosen": 0.38607218861579895, "logits/rejected": 0.414478600025177, "logps/accuracies": 0.625, "logps/chosen": -328.02618408203125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -322.28265380859375, "logps/ref_rejected": -297.2394104003906, "logps/rejected": -336.27459716796875, "loss": 0.6757, "rewards/accuracies": 0.75, "rewards/chosen": -0.5743532180786133, "rewards/grad_term": 0.023728108033537865, "rewards/margins": 3.329164505004883, "rewards/rejected": -3.903517961502075, "step": 58 }, { "epoch": 0.12239076883184234, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 31.46030833209825, "learning_rate": 8.91321323481661e-07, "logits/chosen": 0.6807994246482849, "logits/rejected": 0.7104217410087585, "logps/accuracies": 0.8125, "logps/chosen": -331.4752502441406, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -335.2398986816406, "logps/ref_rejected": -332.1490173339844, "logps/rejected": -374.0831298828125, "loss": 0.6771, "rewards/accuracies": 0.9375, "rewards/chosen": 0.37646305561065674, "rewards/grad_term": 0.010923169553279877, "rewards/margins": 4.56987190246582, "rewards/rejected": -4.193408966064453, "step": 59 }, { "epoch": 0.12446518864255153, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 23.984091347684366, "learning_rate": 8.949952426418784e-07, "logits/chosen": 0.568733811378479, "logits/rejected": 0.635265052318573, "logps/accuracies": 0.6875, "logps/chosen": -397.9205322265625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -402.82952880859375, "logps/ref_rejected": -363.6296691894531, "logps/rejected": -400.825439453125, "loss": 0.6358, "rewards/accuracies": 0.875, "rewards/chosen": 0.4909006357192993, "rewards/grad_term": 0.015072712674736977, "rewards/margins": 4.210475921630859, "rewards/rejected": -3.7195756435394287, "step": 60 }, { "epoch": 0.12653960845326073, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.125, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 73.64774037078915, "learning_rate": 8.986084330770518e-07, "logits/chosen": 0.7834938764572144, "logits/rejected": 0.8703972101211548, "logps/accuracies": 0.6875, "logps/chosen": -256.0115661621094, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -239.9134521484375, "logps/ref_rejected": -261.8016662597656, "logps/rejected": -313.2479248046875, "loss": 0.6474, "rewards/accuracies": 0.875, "rewards/chosen": -1.6098082065582275, "rewards/grad_term": 0.01827179826796055, "rewards/margins": 3.534818172454834, "rewards/rejected": -5.144626140594482, "step": 61 }, { "epoch": 0.12861402826396992, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 25.70529599001306, "learning_rate": 9.021628698377976e-07, "logits/chosen": 0.5873112082481384, "logits/rejected": 0.6506080627441406, "logps/accuracies": 0.75, "logps/chosen": -274.6400451660156, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -276.3179016113281, "logps/ref_rejected": -279.714599609375, "logps/rejected": -340.3433532714844, "loss": 0.6359, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16778606176376343, "rewards/grad_term": 0.014593811705708504, "rewards/margins": 6.230656623840332, "rewards/rejected": -6.062870979309082, "step": 62 }, { "epoch": 0.1306884480746791, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 29.57488992234054, "learning_rate": 9.056604331640114e-07, "logits/chosen": 0.511448323726654, "logits/rejected": 0.4164316654205322, "logps/accuracies": 0.75, "logps/chosen": -254.2750244140625, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -253.3539276123047, "logps/ref_rejected": -264.7762145996094, "logps/rejected": -299.82305908203125, "loss": 0.6508, "rewards/accuracies": 0.875, "rewards/chosen": -0.09211038053035736, "rewards/grad_term": 0.026033716276288033, "rewards/margins": 3.4125752449035645, "rewards/rejected": -3.504685401916504, "step": 63 }, { "epoch": 0.1327628678853883, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 55.6766766699201, "learning_rate": 9.091029144578332e-07, "logits/chosen": 0.5473611354827881, "logits/rejected": 0.6334167122840881, "logps/accuracies": 0.8125, "logps/chosen": -307.349365234375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -308.5955505371094, "logps/ref_rejected": -327.8055725097656, "logps/rejected": -374.07574462890625, "loss": 0.6354, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12462212890386581, "rewards/grad_term": 0.011786059476435184, "rewards/margins": 4.751638412475586, "rewards/rejected": -4.627016067504883, "step": 64 }, { "epoch": 0.1327628678853883, "eval_flips/correct->correct": 0.43842363357543945, "eval_flips/correct->incorrect": 0.004926108289510012, "eval_flips/incorrect->correct": 0.2660098373889923, "eval_flips/incorrect->incorrect": 0.29064038395881653, "eval_logits/chosen": 0.5654913783073425, "eval_logits/rejected": 0.6160324215888977, "eval_logps/accuracies": 0.7044335007667542, "eval_logps/chosen": -288.4407958984375, "eval_logps/ref_accuracies": 0.4433497488498688, "eval_logps/ref_chosen": -287.3511047363281, "eval_logps/ref_rejected": -289.0460205078125, "eval_logps/rejected": -328.46038818359375, "eval_loss": 0.6570103168487549, "eval_rewards/accuracies": 0.8325123190879822, "eval_rewards/chosen": -0.10896830260753632, "eval_rewards/grad_term": 0.021043213084340096, "eval_rewards/margins": 3.8324687480926514, "eval_rewards/rejected": -3.9414374828338623, "eval_runtime": 786.9931, "eval_samples_per_second": 2.056, "eval_steps_per_second": 0.258, "step": 64 }, { "epoch": 0.13483728769609749, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 24.735517627215668, "learning_rate": 9.124920217935358e-07, "logits/chosen": 0.40278834104537964, "logits/rejected": 0.4163047969341278, "logps/accuracies": 0.875, "logps/chosen": -353.63824462890625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -357.9703369140625, "logps/ref_rejected": -365.9423522949219, "logps/rejected": -425.2349853515625, "loss": 0.6043, "rewards/accuracies": 1.0, "rewards/chosen": 0.4332119822502136, "rewards/grad_term": 0.007745261769741774, "rewards/margins": 6.362478256225586, "rewards/rejected": -5.929266452789307, "step": 65 }, { "epoch": 0.1369117075068067, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 26.834165183727563, "learning_rate": 9.158293850067597e-07, "logits/chosen": 0.387469083070755, "logits/rejected": 0.4058898091316223, "logps/accuracies": 0.8125, "logps/chosen": -252.04205322265625, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -252.20950317382812, "logps/ref_rejected": -263.31280517578125, "logps/rejected": -316.500244140625, "loss": 0.6308, "rewards/accuracies": 0.8125, "rewards/chosen": 0.01674594357609749, "rewards/grad_term": 0.014994516968727112, "rewards/margins": 5.335488319396973, "rewards/rejected": -5.318742275238037, "step": 66 }, { "epoch": 0.1389861273175159, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 31.0308823025867, "learning_rate": 9.191165604010531e-07, "logits/chosen": 0.3395693302154541, "logits/rejected": 0.34473684430122375, "logps/accuracies": 0.75, "logps/chosen": -325.09197998046875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -328.00286865234375, "logps/ref_rejected": -305.96258544921875, "logps/rejected": -359.0819396972656, "loss": 0.6403, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2910885214805603, "rewards/grad_term": 0.009055268950760365, "rewards/margins": 5.603026390075684, "rewards/rejected": -5.3119378089904785, "step": 67 }, { "epoch": 0.14106054712822508, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 26.519671253661027, "learning_rate": 9.22355035105741e-07, "logits/chosen": 0.4188442528247833, "logits/rejected": 0.4437766969203949, "logps/accuracies": 0.6875, "logps/chosen": -293.8087463378906, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -280.9673156738281, "logps/ref_rejected": -302.37518310546875, "logps/rejected": -354.1598815917969, "loss": 0.619, "rewards/accuracies": 0.6875, "rewards/chosen": -1.284143090248108, "rewards/grad_term": 0.02901587449014187, "rewards/margins": 3.8943264484405518, "rewards/rejected": -5.178469657897949, "step": 68 }, { "epoch": 0.14313496693893427, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.4375, "grad_norm": 30.502044803594153, "learning_rate": 9.255462311156644e-07, "logits/chosen": 0.5335452556610107, "logits/rejected": 0.5705280303955078, "logps/accuracies": 0.5625, "logps/chosen": -320.91192626953125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -304.9333190917969, "logps/ref_rejected": -281.81768798828125, "logps/rejected": -346.9627685546875, "loss": 0.6755, "rewards/accuracies": 0.875, "rewards/chosen": -1.5978612899780273, "rewards/grad_term": 0.018111273646354675, "rewards/margins": 4.916650295257568, "rewards/rejected": -6.514511585235596, "step": 69 }, { "epoch": 0.14520938674964345, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 39.33816705000978, "learning_rate": 9.286915090402617e-07, "logits/chosen": 0.4920622706413269, "logits/rejected": 0.5008682012557983, "logps/accuracies": 0.8125, "logps/chosen": -302.3096618652344, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -291.40972900390625, "logps/ref_rejected": -286.4915771484375, "logps/rejected": -359.8939514160156, "loss": 0.6369, "rewards/accuracies": 0.75, "rewards/chosen": -1.0899897813796997, "rewards/grad_term": 0.0171764325350523, "rewards/margins": 6.250240325927734, "rewards/rejected": -7.340230464935303, "step": 70 }, { "epoch": 0.14728380656035264, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 32.47379922437437, "learning_rate": 9.317921715867286e-07, "logits/chosen": 0.5690668225288391, "logits/rejected": 0.6497770547866821, "logps/accuracies": 0.75, "logps/chosen": -300.4138488769531, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -293.0126647949219, "logps/ref_rejected": -293.5539855957031, "logps/rejected": -361.0144348144531, "loss": 0.6126, "rewards/accuracies": 0.875, "rewards/chosen": -0.7401193976402283, "rewards/grad_term": 0.012420150451362133, "rewards/margins": 6.005928039550781, "rewards/rejected": -6.746047019958496, "step": 71 }, { "epoch": 0.14935822637106183, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 35.11526252181687, "learning_rate": 9.348494667995762e-07, "logits/chosen": 0.5223222970962524, "logits/rejected": 0.6166201829910278, "logps/accuracies": 0.875, "logps/chosen": -262.4486083984375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -247.41668701171875, "logps/ref_rejected": -251.63619995117188, "logps/rejected": -323.4156494140625, "loss": 0.6372, "rewards/accuracies": 0.8125, "rewards/chosen": -1.503194808959961, "rewards/grad_term": 0.016301069408655167, "rewards/margins": 5.674752712249756, "rewards/rejected": -7.177947521209717, "step": 72 }, { "epoch": 0.15143264618177105, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.3125, "grad_norm": 33.25143057906705, "learning_rate": 9.378645910767493e-07, "logits/chosen": 0.5215972065925598, "logits/rejected": 0.4775215685367584, "logps/accuracies": 0.6875, "logps/chosen": -257.8221435546875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -257.3147888183594, "logps/ref_rejected": -245.8674774169922, "logps/rejected": -302.9472961425781, "loss": 0.6342, "rewards/accuracies": 1.0, "rewards/chosen": -0.050737857818603516, "rewards/grad_term": 0.01016687136143446, "rewards/margins": 5.657248497009277, "rewards/rejected": -5.707986831665039, "step": 73 }, { "epoch": 0.15350706599248023, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.6875, "flips/incorrect->incorrect": 0.0625, "grad_norm": 52.55742381973749, "learning_rate": 9.408386919805467e-07, "logits/chosen": 0.7360602021217346, "logits/rejected": 0.70041424036026, "logps/accuracies": 0.9375, "logps/chosen": -317.7826843261719, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -302.98712158203125, "logps/ref_rejected": -267.1181945800781, "logps/rejected": -356.98870849609375, "loss": 0.6432, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4795535802841187, "rewards/grad_term": 0.008950343355536461, "rewards/margins": 7.507498741149902, "rewards/rejected": -8.987051963806152, "step": 74 }, { "epoch": 0.15558148580318942, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 26.35632884945645, "learning_rate": 9.437728708598716e-07, "logits/chosen": 0.3639271855354309, "logits/rejected": 0.38472047448158264, "logps/accuracies": 0.875, "logps/chosen": -278.147216796875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -264.1582336425781, "logps/ref_rejected": -274.158203125, "logps/rejected": -352.51422119140625, "loss": 0.6529, "rewards/accuracies": 0.875, "rewards/chosen": -1.3989008665084839, "rewards/grad_term": 0.011926427483558655, "rewards/margins": 6.436697959899902, "rewards/rejected": -7.835598945617676, "step": 75 }, { "epoch": 0.1576559056138986, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 43.67163684751143, "learning_rate": 9.466681852988078e-07, "logits/chosen": 0.6780661344528198, "logits/rejected": 0.7738847732543945, "logps/accuracies": 0.75, "logps/chosen": -286.3451843261719, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -271.2528991699219, "logps/ref_rejected": -271.053955078125, "logps/rejected": -328.85772705078125, "loss": 0.6067, "rewards/accuracies": 0.75, "rewards/chosen": -1.5092324018478394, "rewards/grad_term": 0.023612529039382935, "rewards/margins": 4.271145820617676, "rewards/rejected": -5.780378341674805, "step": 76 }, { "epoch": 0.1597303254246078, "flips/correct->correct": 0.875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.0, "grad_norm": 22.36636463977758, "learning_rate": 9.495256514051431e-07, "logits/chosen": 0.4788045287132263, "logits/rejected": 0.549846887588501, "logps/accuracies": 1.0, "logps/chosen": -222.5209197998047, "logps/ref_accuracies": 0.875, "logps/ref_chosen": -207.42868041992188, "logps/ref_rejected": -236.1974334716797, "logps/rejected": -293.5431823730469, "loss": 0.6448, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5092250108718872, "rewards/grad_term": 0.0173178743571043, "rewards/margins": 4.225347995758057, "rewards/rejected": -5.734574317932129, "step": 77 }, { "epoch": 0.161804745235317, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 25.267712307372168, "learning_rate": 9.523462459512337e-07, "logits/chosen": 0.5372971892356873, "logits/rejected": 0.6544579863548279, "logps/accuracies": 0.9375, "logps/chosen": -278.4732360839844, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -275.07958984375, "logps/ref_rejected": -292.14898681640625, "logps/rejected": -352.7454833984375, "loss": 0.6166, "rewards/accuracies": 0.875, "rewards/chosen": -0.3393683433532715, "rewards/grad_term": 0.022649819031357765, "rewards/margins": 5.720274925231934, "rewards/rejected": -6.059643745422363, "step": 78 }, { "epoch": 0.16387916504602618, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0625, "grad_norm": 57.52603839415039, "learning_rate": 9.551309083784976e-07, "logits/chosen": 0.6397267580032349, "logits/rejected": 0.7187516093254089, "logps/accuracies": 0.9375, "logps/chosen": -273.272705078125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -278.8054504394531, "logps/ref_rejected": -292.9872741699219, "logps/rejected": -340.0445861816406, "loss": 0.6701, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5532730221748352, "rewards/grad_term": 0.014312355779111385, "rewards/margins": 5.259001731872559, "rewards/rejected": -4.705729007720947, "step": 79 }, { "epoch": 0.1659535848567354, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 50.777119418812084, "learning_rate": 9.578805426758263e-07, "logits/chosen": 0.4606146216392517, "logits/rejected": 0.46222275495529175, "logps/accuracies": 0.8125, "logps/chosen": -292.800537109375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -291.5415954589844, "logps/ref_rejected": -313.3748474121094, "logps/rejected": -364.9443054199219, "loss": 0.6283, "rewards/accuracies": 0.875, "rewards/chosen": -0.1258973628282547, "rewards/grad_term": 0.018451694399118423, "rewards/margins": 5.031045436859131, "rewards/rejected": -5.156942367553711, "step": 80 }, { "epoch": 0.16802800466744458, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 25.904014404983347, "learning_rate": 9.605960191413192e-07, "logits/chosen": 0.5609871745109558, "logits/rejected": 0.646887481212616, "logps/accuracies": 0.6875, "logps/chosen": -388.10205078125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -396.9491271972656, "logps/ref_rejected": -395.2713928222656, "logps/rejected": -423.269287109375, "loss": 0.5963, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8847097158432007, "rewards/grad_term": 0.024479346349835396, "rewards/margins": 3.6844961643218994, "rewards/rejected": -2.7997865676879883, "step": 81 }, { "epoch": 0.17010242447815377, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.3125, "grad_norm": 18.071356978636363, "learning_rate": 9.632781760359235e-07, "logits/chosen": 0.2946923077106476, "logits/rejected": 0.26006707549095154, "logps/accuracies": 0.6875, "logps/chosen": -222.20687866210938, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -218.695068359375, "logps/ref_rejected": -223.76553344726562, "logps/rejected": -264.6587829589844, "loss": 0.6335, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3511809706687927, "rewards/grad_term": 0.025286730378866196, "rewards/margins": 3.7381458282470703, "rewards/rejected": -4.08932638168335, "step": 82 }, { "epoch": 0.17217684428886296, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.125, "grad_norm": 29.580870064695095, "learning_rate": 9.659278211368498e-07, "logits/chosen": 0.653415322303772, "logits/rejected": 0.7497892379760742, "logps/accuracies": 0.875, "logps/chosen": -334.653564453125, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -324.0084533691406, "logps/ref_rejected": -340.58624267578125, "logps/rejected": -422.7427978515625, "loss": 0.6484, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0645086765289307, "rewards/grad_term": 0.018634023144841194, "rewards/margins": 7.15114688873291, "rewards/rejected": -8.215656280517578, "step": 83 }, { "epoch": 0.17425126409957215, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 37.56856528542604, "learning_rate": 9.685457331979593e-07, "logits/chosen": 0.7688320875167847, "logits/rejected": 0.913873553276062, "logps/accuracies": 0.8125, "logps/chosen": -252.0431671142578, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -241.2303466796875, "logps/ref_rejected": -278.7004699707031, "logps/rejected": -341.7118835449219, "loss": 0.6808, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0812804698944092, "rewards/grad_term": 0.022220587357878685, "rewards/margins": 5.219861030578613, "rewards/rejected": -6.301141738891602, "step": 84 }, { "epoch": 0.17632568391028133, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 25.08015845081592, "learning_rate": 9.711326633237342e-07, "logits/chosen": 0.6746060252189636, "logits/rejected": 0.6128141283988953, "logps/accuracies": 0.8125, "logps/chosen": -324.61865234375, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -314.5938415527344, "logps/ref_rejected": -327.64666748046875, "logps/rejected": -388.7850036621094, "loss": 0.58, "rewards/accuracies": 0.8125, "rewards/chosen": -1.002484679222107, "rewards/grad_term": 0.017657626420259476, "rewards/margins": 5.111349582672119, "rewards/rejected": -6.113834857940674, "step": 85 }, { "epoch": 0.17840010372099052, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 28.420792350466737, "learning_rate": 9.736893362628883e-07, "logits/chosen": 0.49216994643211365, "logits/rejected": 0.5920721888542175, "logps/accuracies": 0.9375, "logps/chosen": -299.5179443359375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -294.0077209472656, "logps/ref_rejected": -302.6850280761719, "logps/rejected": -385.48388671875, "loss": 0.6414, "rewards/accuracies": 1.0, "rewards/chosen": -0.5510200262069702, "rewards/grad_term": 0.004014983773231506, "rewards/margins": 7.728863716125488, "rewards/rejected": -8.279884338378906, "step": 86 }, { "epoch": 0.18047452353169974, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 32.17317573293341, "learning_rate": 9.762164516272033e-07, "logits/chosen": 0.7234176397323608, "logits/rejected": 0.7146831154823303, "logps/accuracies": 0.9375, "logps/chosen": -299.3135681152344, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -294.986083984375, "logps/ref_rejected": -306.23895263671875, "logps/rejected": -362.81964111328125, "loss": 0.6571, "rewards/accuracies": 0.9375, "rewards/chosen": -0.43274781107902527, "rewards/grad_term": 0.012466475367546082, "rewards/margins": 5.2253193855285645, "rewards/rejected": -5.658066749572754, "step": 87 }, { "epoch": 0.18254894334240893, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 26.897654394769035, "learning_rate": 9.787146850407078e-07, "logits/chosen": 0.47364750504493713, "logits/rejected": 0.5636922717094421, "logps/accuracies": 0.75, "logps/chosen": -264.1487121582031, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -270.39495849609375, "logps/ref_rejected": -258.71246337890625, "logps/rejected": -319.8270263671875, "loss": 0.6117, "rewards/accuracies": 0.875, "rewards/chosen": 0.6246242523193359, "rewards/grad_term": 0.01352761872112751, "rewards/margins": 6.736079216003418, "rewards/rejected": -6.111454963684082, "step": 88 }, { "epoch": 0.18462336315311811, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 36.70610652642651, "learning_rate": 9.811846892239293e-07, "logits/chosen": 0.1739477515220642, "logits/rejected": 0.20079316198825836, "logps/accuracies": 0.875, "logps/chosen": -334.56201171875, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -319.6775207519531, "logps/ref_rejected": -328.6192626953125, "logps/rejected": -389.33929443359375, "loss": 0.5805, "rewards/accuracies": 0.625, "rewards/chosen": -1.4884480237960815, "rewards/grad_term": 0.028538305312395096, "rewards/margins": 4.583554267883301, "rewards/rejected": -6.072002410888672, "step": 89 }, { "epoch": 0.1866977829638273, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 38.62477740343346, "learning_rate": 9.836270950175693e-07, "logits/chosen": 0.5048727989196777, "logits/rejected": 0.5224493741989136, "logps/accuracies": 0.875, "logps/chosen": -265.7325439453125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -250.5098419189453, "logps/ref_rejected": -255.43650817871094, "logps/rejected": -315.4735107421875, "loss": 0.6476, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5222673416137695, "rewards/grad_term": 0.023190699517726898, "rewards/margins": 4.481435298919678, "rewards/rejected": -6.003702640533447, "step": 90 }, { "epoch": 0.1887722027745365, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 36.7027529146051, "learning_rate": 9.860425123496167e-07, "logits/chosen": 0.5219244360923767, "logits/rejected": 0.5849474668502808, "logps/accuracies": 0.9375, "logps/chosen": -240.11685180664062, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -234.85340881347656, "logps/ref_rejected": -262.1112060546875, "logps/rejected": -327.15716552734375, "loss": 0.6069, "rewards/accuracies": 1.0, "rewards/chosen": -0.5263462662696838, "rewards/grad_term": 0.004026439506560564, "rewards/margins": 5.978251934051514, "rewards/rejected": -6.504598140716553, "step": 91 }, { "epoch": 0.19084662258524568, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 34.422228673025685, "learning_rate": 9.884315311496123e-07, "logits/chosen": 0.5342029929161072, "logits/rejected": 0.5386108160018921, "logps/accuracies": 0.75, "logps/chosen": -340.9605407714844, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -338.02606201171875, "logps/ref_rejected": -346.3376770019531, "logps/rejected": -377.7868347167969, "loss": 0.5815, "rewards/accuracies": 0.875, "rewards/chosen": -0.29344886541366577, "rewards/grad_term": 0.020996563136577606, "rewards/margins": 2.8514671325683594, "rewards/rejected": -3.1449155807495117, "step": 92 }, { "epoch": 0.1929210423959549, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 25.326369311886236, "learning_rate": 9.907947222134885e-07, "logits/chosen": 0.4443345069885254, "logits/rejected": 0.4642353653907776, "logps/accuracies": 0.875, "logps/chosen": -346.2325744628906, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -345.8390197753906, "logps/ref_rejected": -357.72564697265625, "logps/rejected": -413.6025085449219, "loss": 0.5793, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03935527801513672, "rewards/grad_term": 0.00771428644657135, "rewards/margins": 5.548335552215576, "rewards/rejected": -5.587691783905029, "step": 93 }, { "epoch": 0.19499546220666408, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 21.376114840937195, "learning_rate": 9.931326380221604e-07, "logits/chosen": 0.6561794281005859, "logits/rejected": 0.7463537454605103, "logps/accuracies": 0.8125, "logps/chosen": -254.1697540283203, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -258.75799560546875, "logps/ref_rejected": -282.442138671875, "logps/rejected": -320.4227600097656, "loss": 0.5967, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4588264226913452, "rewards/grad_term": 0.02241508476436138, "rewards/margins": 4.256890296936035, "rewards/rejected": -3.7980637550354004, "step": 94 }, { "epoch": 0.19706988201737327, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 28.87335565570023, "learning_rate": 9.95445813516801e-07, "logits/chosen": 0.31641554832458496, "logits/rejected": 0.4115113914012909, "logps/accuracies": 0.8125, "logps/chosen": -305.4784240722656, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -301.1734619140625, "logps/ref_rejected": -309.7505187988281, "logps/rejected": -370.07855224609375, "loss": 0.595, "rewards/accuracies": 1.0, "rewards/chosen": -0.43049633502960205, "rewards/grad_term": 0.007062141317874193, "rewards/margins": 5.602307319641113, "rewards/rejected": -6.032803535461426, "step": 95 }, { "epoch": 0.19914430182808246, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 30.52259082669545, "learning_rate": 9.977347668335242e-07, "logits/chosen": 0.5447170734405518, "logits/rejected": 0.6960605978965759, "logps/accuracies": 0.8125, "logps/chosen": -320.6680603027344, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -323.8368225097656, "logps/ref_rejected": -339.37957763671875, "logps/rejected": -400.385009765625, "loss": 0.6261, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3168814182281494, "rewards/grad_term": 0.006425461731851101, "rewards/margins": 6.417423248291016, "rewards/rejected": -6.100542068481445, "step": 96 }, { "epoch": 0.20121872163879165, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.125, "grad_norm": 51.93086544682398, "learning_rate": 1e-06, "logits/chosen": 0.6568098068237305, "logits/rejected": 0.6733189225196838, "logps/accuracies": 0.875, "logps/chosen": -286.9490966796875, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -284.9871826171875, "logps/ref_rejected": -301.6272888183594, "logps/rejected": -357.9549560546875, "loss": 0.5764, "rewards/accuracies": 0.875, "rewards/chosen": -0.19619154930114746, "rewards/grad_term": 0.018322059884667397, "rewards/margins": 5.436576843261719, "rewards/rejected": -5.632768630981445, "step": 97 }, { "epoch": 0.20329314144950084, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 28.258232668290642, "learning_rate": 1e-06, "logits/chosen": 0.41130974888801575, "logits/rejected": 0.47705498337745667, "logps/accuracies": 0.75, "logps/chosen": -322.64227294921875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -310.24725341796875, "logps/ref_rejected": -321.1720275878906, "logps/rejected": -375.3896484375, "loss": 0.6226, "rewards/accuracies": 0.75, "rewards/chosen": -1.239502191543579, "rewards/grad_term": 0.022803550586104393, "rewards/margins": 4.182260990142822, "rewards/rejected": -5.4217634201049805, "step": 98 }, { "epoch": 0.20536756126021002, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.25, "grad_norm": 27.738502482797642, "learning_rate": 9.988465974625143e-07, "logits/chosen": 0.4429183602333069, "logits/rejected": 0.5393229126930237, "logps/accuracies": 0.75, "logps/chosen": -272.3836975097656, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -278.6350402832031, "logps/ref_rejected": -277.7386169433594, "logps/rejected": -315.6932373046875, "loss": 0.6369, "rewards/accuracies": 0.875, "rewards/chosen": 0.6251335740089417, "rewards/grad_term": 0.01758977398276329, "rewards/margins": 4.420593738555908, "rewards/rejected": -3.7954602241516113, "step": 99 }, { "epoch": 0.20744198107091924, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 49.56019663548194, "learning_rate": 9.976931949250289e-07, "logits/chosen": 0.5111449956893921, "logits/rejected": 0.4637998640537262, "logps/accuracies": 0.8125, "logps/chosen": -305.05950927734375, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -303.1130065917969, "logps/ref_rejected": -295.25433349609375, "logps/rejected": -367.4417724609375, "loss": 0.635, "rewards/accuracies": 1.0, "rewards/chosen": -0.19465157389640808, "rewards/grad_term": 0.006312578916549683, "rewards/margins": 7.024093151092529, "rewards/rejected": -7.218744277954102, "step": 100 }, { "epoch": 0.20951640088162843, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 17.980236189059163, "learning_rate": 9.965397923875432e-07, "logits/chosen": 0.5828474760055542, "logits/rejected": 0.6235547661781311, "logps/accuracies": 0.8125, "logps/chosen": -270.784912109375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -273.26043701171875, "logps/ref_rejected": -269.11077880859375, "logps/rejected": -325.9720458984375, "loss": 0.6338, "rewards/accuracies": 0.8125, "rewards/chosen": 0.24755248427391052, "rewards/grad_term": 0.01740310713648796, "rewards/margins": 5.933681488037109, "rewards/rejected": -5.686128616333008, "step": 101 }, { "epoch": 0.21159082069233762, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 51.922995548635456, "learning_rate": 9.953863898500576e-07, "logits/chosen": 0.18250882625579834, "logits/rejected": 0.20775896310806274, "logps/accuracies": 0.8125, "logps/chosen": -266.9601745605469, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -265.5532531738281, "logps/ref_rejected": -263.236328125, "logps/rejected": -325.6028747558594, "loss": 0.5857, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14069411158561707, "rewards/grad_term": 0.010405524633824825, "rewards/margins": 6.095961570739746, "rewards/rejected": -6.236655235290527, "step": 102 }, { "epoch": 0.2136652405030468, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.3125, "grad_norm": 58.49656496183437, "learning_rate": 9.94232987312572e-07, "logits/chosen": 0.24150438606739044, "logits/rejected": 0.23409827053546906, "logps/accuracies": 0.6875, "logps/chosen": -275.4272155761719, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -267.8497009277344, "logps/ref_rejected": -259.2445068359375, "logps/rejected": -302.8243103027344, "loss": 0.654, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7577495574951172, "rewards/grad_term": 0.027012387290596962, "rewards/margins": 3.6002304553985596, "rewards/rejected": -4.357979774475098, "step": 103 }, { "epoch": 0.215739660313756, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 33.69936760710596, "learning_rate": 9.930795847750865e-07, "logits/chosen": 0.37147602438926697, "logits/rejected": 0.5065699219703674, "logps/accuracies": 0.8125, "logps/chosen": -246.92913818359375, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -244.8133544921875, "logps/ref_rejected": -273.55145263671875, "logps/rejected": -323.93560791015625, "loss": 0.5903, "rewards/accuracies": 0.875, "rewards/chosen": -0.211576446890831, "rewards/grad_term": 0.01827353984117508, "rewards/margins": 4.82683801651001, "rewards/rejected": -5.038414001464844, "step": 104 }, { "epoch": 0.21781408012446518, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.3125, "grad_norm": 30.373799785451364, "learning_rate": 9.919261822376009e-07, "logits/chosen": 0.650319516658783, "logits/rejected": 0.6357383728027344, "logps/accuracies": 0.6875, "logps/chosen": -262.9386901855469, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -259.9366760253906, "logps/ref_rejected": -256.1930236816406, "logps/rejected": -282.44952392578125, "loss": 0.5851, "rewards/accuracies": 1.0, "rewards/chosen": -0.3002018928527832, "rewards/grad_term": 0.028288275003433228, "rewards/margins": 2.3254497051239014, "rewards/rejected": -2.6256518363952637, "step": 105 }, { "epoch": 0.21988849993517437, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 24.848196690851, "learning_rate": 9.907727797001152e-07, "logits/chosen": 0.35767537355422974, "logits/rejected": 0.42828047275543213, "logps/accuracies": 0.9375, "logps/chosen": -260.8604736328125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -265.82379150390625, "logps/ref_rejected": -287.0606689453125, "logps/rejected": -353.9102478027344, "loss": 0.5905, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4963342547416687, "rewards/grad_term": 0.011764682829380035, "rewards/margins": 7.18129301071167, "rewards/rejected": -6.684958457946777, "step": 106 }, { "epoch": 0.22196291974588359, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 50.26567742873495, "learning_rate": 9.896193771626296e-07, "logits/chosen": 0.24374046921730042, "logits/rejected": 0.2071159929037094, "logps/accuracies": 0.8125, "logps/chosen": -322.9122009277344, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -321.7509765625, "logps/ref_rejected": -327.6671142578125, "logps/rejected": -379.50433349609375, "loss": 0.5947, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11612237989902496, "rewards/grad_term": 0.013888241723179817, "rewards/margins": 5.06759786605835, "rewards/rejected": -5.183720588684082, "step": 107 }, { "epoch": 0.22403733955659277, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 24.60066728160171, "learning_rate": 9.884659746251442e-07, "logits/chosen": 0.28119832277297974, "logits/rejected": 0.4410630464553833, "logps/accuracies": 0.8125, "logps/chosen": -263.49688720703125, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -259.3692626953125, "logps/ref_rejected": -296.3498229980469, "logps/rejected": -338.8854064941406, "loss": 0.6482, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41276171803474426, "rewards/grad_term": 0.030529310926795006, "rewards/margins": 3.840797185897827, "rewards/rejected": -4.253559112548828, "step": 108 }, { "epoch": 0.22611175936730196, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.375, "grad_norm": 29.88832833016055, "learning_rate": 9.873125720876585e-07, "logits/chosen": 0.4925777018070221, "logits/rejected": 0.39786702394485474, "logps/accuracies": 0.625, "logps/chosen": -288.4710693359375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -291.87298583984375, "logps/ref_rejected": -257.73553466796875, "logps/rejected": -322.43603515625, "loss": 0.6003, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3401949405670166, "rewards/grad_term": 0.005394600797444582, "rewards/margins": 6.810248851776123, "rewards/rejected": -6.4700541496276855, "step": 109 }, { "epoch": 0.22818617917801115, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 23.915753236950135, "learning_rate": 9.861591695501729e-07, "logits/chosen": 0.2205159217119217, "logits/rejected": 0.19697824120521545, "logps/accuracies": 0.8125, "logps/chosen": -352.7537841796875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -355.77337646484375, "logps/ref_rejected": -356.9278564453125, "logps/rejected": -400.95245361328125, "loss": 0.5934, "rewards/accuracies": 0.8125, "rewards/chosen": 0.30196261405944824, "rewards/grad_term": 0.017759006470441818, "rewards/margins": 4.704426288604736, "rewards/rejected": -4.402463436126709, "step": 110 }, { "epoch": 0.23026059898872034, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 27.27299967581395, "learning_rate": 9.850057670126874e-07, "logits/chosen": 0.37821733951568604, "logits/rejected": 0.4970583915710449, "logps/accuracies": 0.75, "logps/chosen": -237.38504028320312, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -230.9459228515625, "logps/ref_rejected": -249.4907684326172, "logps/rejected": -298.8011169433594, "loss": 0.633, "rewards/accuracies": 0.875, "rewards/chosen": -0.6439133286476135, "rewards/grad_term": 0.018191155046224594, "rewards/margins": 4.287120819091797, "rewards/rejected": -4.931033134460449, "step": 111 }, { "epoch": 0.23233501879942953, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 39.145128913057, "learning_rate": 9.838523644752018e-07, "logits/chosen": 0.1512741595506668, "logits/rejected": 0.32822132110595703, "logps/accuracies": 0.75, "logps/chosen": -267.9004821777344, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -264.21697998046875, "logps/ref_rejected": -307.68572998046875, "logps/rejected": -361.28033447265625, "loss": 0.5966, "rewards/accuracies": 0.875, "rewards/chosen": -0.3683478534221649, "rewards/grad_term": 0.017407521605491638, "rewards/margins": 4.9911088943481445, "rewards/rejected": -5.359456539154053, "step": 112 }, { "epoch": 0.23440943861013872, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 27.5954679159607, "learning_rate": 9.826989619377162e-07, "logits/chosen": 0.5426469445228577, "logits/rejected": 0.5697547197341919, "logps/accuracies": 0.75, "logps/chosen": -312.826904296875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -306.6598815917969, "logps/ref_rejected": -277.28387451171875, "logps/rejected": -353.48114013671875, "loss": 0.6009, "rewards/accuracies": 0.875, "rewards/chosen": -0.616702675819397, "rewards/grad_term": 0.01106889545917511, "rewards/margins": 7.003021240234375, "rewards/rejected": -7.619723320007324, "step": 113 }, { "epoch": 0.23648385842084793, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 40.98809530673688, "learning_rate": 9.815455594002307e-07, "logits/chosen": 0.39876848459243774, "logits/rejected": 0.3462454378604889, "logps/accuracies": 0.8125, "logps/chosen": -294.205078125, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -285.1748046875, "logps/ref_rejected": -286.44140625, "logps/rejected": -345.4891662597656, "loss": 0.6112, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9030314087867737, "rewards/grad_term": 0.011187486350536346, "rewards/margins": 5.001744747161865, "rewards/rejected": -5.904776573181152, "step": 114 }, { "epoch": 0.23855827823155712, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 40.11356790900559, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.5221942067146301, "logits/rejected": 0.4882541298866272, "logps/accuracies": 0.8125, "logps/chosen": -260.0171203613281, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -251.60545349121094, "logps/ref_rejected": -259.59515380859375, "logps/rejected": -315.2181701660156, "loss": 0.6161, "rewards/accuracies": 0.875, "rewards/chosen": -0.8411648869514465, "rewards/grad_term": 0.021061977371573448, "rewards/margins": 4.721133232116699, "rewards/rejected": -5.56229829788208, "step": 115 }, { "epoch": 0.2406326980422663, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 31.171420280298065, "learning_rate": 9.792387543252594e-07, "logits/chosen": 0.23254762589931488, "logits/rejected": 0.2675570845603943, "logps/accuracies": 0.9375, "logps/chosen": -289.489501953125, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -289.00616455078125, "logps/ref_rejected": -302.8209533691406, "logps/rejected": -371.5626220703125, "loss": 0.5818, "rewards/accuracies": 0.9375, "rewards/chosen": -0.048332199454307556, "rewards/grad_term": 0.007328622043132782, "rewards/margins": 6.825834274291992, "rewards/rejected": -6.874166488647461, "step": 116 }, { "epoch": 0.2427071178529755, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 25.865643815270218, "learning_rate": 9.780853517877738e-07, "logits/chosen": 0.5108106136322021, "logits/rejected": 0.5345089435577393, "logps/accuracies": 0.875, "logps/chosen": -284.67791748046875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -285.08941650390625, "logps/ref_rejected": -308.15838623046875, "logps/rejected": -370.12554931640625, "loss": 0.5606, "rewards/accuracies": 1.0, "rewards/chosen": 0.04114929586648941, "rewards/grad_term": 0.009833071380853653, "rewards/margins": 6.237868785858154, "rewards/rejected": -6.196719646453857, "step": 117 }, { "epoch": 0.24478153766368468, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 23.22657914288563, "learning_rate": 9.769319492502884e-07, "logits/chosen": 0.23898278176784515, "logits/rejected": 0.2838956415653229, "logps/accuracies": 0.9375, "logps/chosen": -317.9300231933594, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -322.7581787109375, "logps/ref_rejected": -333.2860107421875, "logps/rejected": -404.1823425292969, "loss": 0.5433, "rewards/accuracies": 1.0, "rewards/chosen": 0.4828159809112549, "rewards/grad_term": 0.002703046426177025, "rewards/margins": 7.572445869445801, "rewards/rejected": -7.089630126953125, "step": 118 }, { "epoch": 0.24685595747439387, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 20.20109483182801, "learning_rate": 9.757785467128027e-07, "logits/chosen": 0.6895065307617188, "logits/rejected": 0.7345404624938965, "logps/accuracies": 0.8125, "logps/chosen": -298.1529846191406, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -294.2225341796875, "logps/ref_rejected": -282.6121520996094, "logps/rejected": -337.44476318359375, "loss": 0.5809, "rewards/accuracies": 0.875, "rewards/chosen": -0.3930422067642212, "rewards/grad_term": 0.017846597358584404, "rewards/margins": 5.090217113494873, "rewards/rejected": -5.483259677886963, "step": 119 }, { "epoch": 0.24893037728510306, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.25, "grad_norm": 33.31398261005609, "learning_rate": 9.74625144175317e-07, "logits/chosen": 0.37160423398017883, "logits/rejected": 0.3335186839103699, "logps/accuracies": 0.75, "logps/chosen": -276.2862548828125, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -279.1397399902344, "logps/ref_rejected": -279.9727478027344, "logps/rejected": -327.0055847167969, "loss": 0.62, "rewards/accuracies": 0.8125, "rewards/chosen": 0.28534770011901855, "rewards/grad_term": 0.01826310157775879, "rewards/margins": 4.9886322021484375, "rewards/rejected": -4.70328426361084, "step": 120 }, { "epoch": 0.25100479709581225, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0625, "flips/incorrect->incorrect": 0.4375, "grad_norm": 24.34104219676861, "learning_rate": 9.734717416378314e-07, "logits/chosen": 0.47060269117355347, "logits/rejected": 0.533828854560852, "logps/accuracies": 0.5625, "logps/chosen": -250.86029052734375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -249.06607055664062, "logps/ref_rejected": -266.4825439453125, "logps/rejected": -304.35400390625, "loss": 0.6245, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17942330241203308, "rewards/grad_term": 0.023510945960879326, "rewards/margins": 3.6077194213867188, "rewards/rejected": -3.787142515182495, "step": 121 }, { "epoch": 0.25307921690652146, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.375, "grad_norm": 32.38649249036862, "learning_rate": 9.72318339100346e-07, "logits/chosen": 0.058825843036174774, "logits/rejected": 0.1310182362794876, "logps/accuracies": 0.625, "logps/chosen": -307.8884582519531, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -298.2418212890625, "logps/ref_rejected": -289.80157470703125, "logps/rejected": -332.2846984863281, "loss": 0.6421, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9646634459495544, "rewards/grad_term": 0.028044363483786583, "rewards/margins": 3.283651828765869, "rewards/rejected": -4.248315334320068, "step": 122 }, { "epoch": 0.2551536367172306, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 25.328252823752003, "learning_rate": 9.711649365628604e-07, "logits/chosen": 0.4595690667629242, "logits/rejected": 0.4828678071498871, "logps/accuracies": 0.8125, "logps/chosen": -319.6042785644531, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -314.05908203125, "logps/ref_rejected": -309.8699645996094, "logps/rejected": -373.5873718261719, "loss": 0.638, "rewards/accuracies": 1.0, "rewards/chosen": -0.5545214414596558, "rewards/grad_term": 0.0088451923802495, "rewards/margins": 5.817216396331787, "rewards/rejected": -6.371737480163574, "step": 123 }, { "epoch": 0.25722805652793984, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.3125, "grad_norm": 58.14409414314697, "learning_rate": 9.70011534025375e-07, "logits/chosen": 0.16532814502716064, "logits/rejected": 0.1864890158176422, "logps/accuracies": 0.6875, "logps/chosen": -322.5909729003906, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -328.1080017089844, "logps/ref_rejected": -314.7974548339844, "logps/rejected": -369.16583251953125, "loss": 0.6333, "rewards/accuracies": 1.0, "rewards/chosen": 0.551704466342926, "rewards/grad_term": 0.014467663131654263, "rewards/margins": 5.9885406494140625, "rewards/rejected": -5.4368367195129395, "step": 124 }, { "epoch": 0.25930247633864906, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 29.577490597996505, "learning_rate": 9.688581314878893e-07, "logits/chosen": 0.27106067538261414, "logits/rejected": 0.28159230947494507, "logps/accuracies": 0.875, "logps/chosen": -324.951904296875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -315.4916687011719, "logps/ref_rejected": -310.50274658203125, "logps/rejected": -384.7645568847656, "loss": 0.6228, "rewards/accuracies": 0.875, "rewards/chosen": -0.9460303783416748, "rewards/grad_term": 0.01042198482900858, "rewards/margins": 6.480146884918213, "rewards/rejected": -7.426177024841309, "step": 125 }, { "epoch": 0.2613768961493582, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 28.275175168662855, "learning_rate": 9.677047289504036e-07, "logits/chosen": 0.16482499241828918, "logits/rejected": 0.13334128260612488, "logps/accuracies": 0.8125, "logps/chosen": -395.91998291015625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -403.248046875, "logps/ref_rejected": -382.96343994140625, "logps/rejected": -467.2958679199219, "loss": 0.586, "rewards/accuracies": 1.0, "rewards/chosen": 0.7328065633773804, "rewards/grad_term": 0.002520698821172118, "rewards/margins": 9.166044235229492, "rewards/rejected": -8.43323802947998, "step": 126 }, { "epoch": 0.26345131596006743, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 31.309734628094976, "learning_rate": 9.66551326412918e-07, "logits/chosen": 0.07952776551246643, "logits/rejected": 0.1613186150789261, "logps/accuracies": 0.875, "logps/chosen": -320.70037841796875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -327.677490234375, "logps/ref_rejected": -339.91748046875, "logps/rejected": -401.8999328613281, "loss": 0.6011, "rewards/accuracies": 0.875, "rewards/chosen": 0.6977108120918274, "rewards/grad_term": 0.010999541729688644, "rewards/margins": 6.89595890045166, "rewards/rejected": -6.198247909545898, "step": 127 }, { "epoch": 0.2655257357707766, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 31.741083253136384, "learning_rate": 9.653979238754326e-07, "logits/chosen": 0.36669662594795227, "logits/rejected": 0.40633296966552734, "logps/accuracies": 0.8125, "logps/chosen": -352.07159423828125, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -343.3804016113281, "logps/ref_rejected": -353.6275939941406, "logps/rejected": -414.04425048828125, "loss": 0.6335, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8691204190254211, "rewards/grad_term": 0.015956774353981018, "rewards/margins": 5.172546863555908, "rewards/rejected": -6.0416669845581055, "step": 128 }, { "epoch": 0.2655257357707766, "eval_flips/correct->correct": 0.4236453175544739, "eval_flips/correct->incorrect": 0.019704433158040047, "eval_flips/incorrect->correct": 0.3300492465496063, "eval_flips/incorrect->incorrect": 0.2266009896993637, "eval_logits/chosen": 0.3016127645969391, "eval_logits/rejected": 0.34773820638656616, "eval_logps/accuracies": 0.7536945939064026, "eval_logps/chosen": -294.51837158203125, "eval_logps/ref_accuracies": 0.4433497488498688, "eval_logps/ref_chosen": -287.3511047363281, "eval_logps/ref_rejected": -289.0460205078125, "eval_logps/rejected": -349.0025329589844, "eval_loss": 0.6313375234603882, "eval_rewards/accuracies": 0.8866994976997375, "eval_rewards/chosen": -0.7167255878448486, "eval_rewards/grad_term": 0.016497639939188957, "eval_rewards/margins": 5.278923511505127, "eval_rewards/rejected": -5.995649337768555, "eval_runtime": 785.8607, "eval_samples_per_second": 2.059, "eval_steps_per_second": 0.258, "step": 128 }, { "epoch": 0.2676001555814858, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 35.58302765361098, "learning_rate": 9.64244521337947e-07, "logits/chosen": 0.3390696048736572, "logits/rejected": 0.3560726046562195, "logps/accuracies": 0.8125, "logps/chosen": -322.2768249511719, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -310.5375061035156, "logps/ref_rejected": -317.2485046386719, "logps/rejected": -383.12200927734375, "loss": 0.6209, "rewards/accuracies": 0.875, "rewards/chosen": -1.1739336252212524, "rewards/grad_term": 0.013801316730678082, "rewards/margins": 5.413419246673584, "rewards/rejected": -6.5873517990112305, "step": 129 }, { "epoch": 0.26967457539219497, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.375, "grad_norm": 29.188493789977155, "learning_rate": 9.630911188004613e-07, "logits/chosen": 0.4089130163192749, "logits/rejected": 0.3992210626602173, "logps/accuracies": 0.625, "logps/chosen": -246.3241729736328, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -250.2421417236328, "logps/ref_rejected": -233.44342041015625, "logps/rejected": -285.1191711425781, "loss": 0.6077, "rewards/accuracies": 0.875, "rewards/chosen": 0.391795814037323, "rewards/grad_term": 0.014157270081341267, "rewards/margins": 5.559370040893555, "rewards/rejected": -5.167574405670166, "step": 130 }, { "epoch": 0.2717489952029042, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 36.02900591013536, "learning_rate": 9.619377162629756e-07, "logits/chosen": 0.32642504572868347, "logits/rejected": 0.34259384870529175, "logps/accuracies": 0.875, "logps/chosen": -331.6784973144531, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -329.0844421386719, "logps/ref_rejected": -341.3873596191406, "logps/rejected": -407.9965515136719, "loss": 0.632, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2594057619571686, "rewards/grad_term": 0.006255139596760273, "rewards/margins": 6.4015092849731445, "rewards/rejected": -6.660915374755859, "step": 131 }, { "epoch": 0.2738234150136134, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 24.659869882606102, "learning_rate": 9.607843137254902e-07, "logits/chosen": 0.25219637155532837, "logits/rejected": 0.21175454556941986, "logps/accuracies": 0.6875, "logps/chosen": -320.4070129394531, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -320.45111083984375, "logps/ref_rejected": -291.7745056152344, "logps/rejected": -371.3065490722656, "loss": 0.5938, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0044078826904296875, "rewards/grad_term": 0.007969305850565434, "rewards/margins": 7.957607269287109, "rewards/rejected": -7.953199863433838, "step": 132 }, { "epoch": 0.27589783482432256, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 66.69910712869152, "learning_rate": 9.596309111880046e-07, "logits/chosen": 0.4336986243724823, "logits/rejected": 0.4323787987232208, "logps/accuracies": 0.6875, "logps/chosen": -302.4508361816406, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -285.8231201171875, "logps/ref_rejected": -284.0436706542969, "logps/rejected": -342.44122314453125, "loss": 0.6008, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6627693176269531, "rewards/grad_term": 0.020465871319174767, "rewards/margins": 4.176986217498779, "rewards/rejected": -5.839755535125732, "step": 133 }, { "epoch": 0.2779722546350318, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.25, "grad_norm": 35.23241127542349, "learning_rate": 9.58477508650519e-07, "logits/chosen": 0.502811074256897, "logits/rejected": 0.5239925980567932, "logps/accuracies": 0.75, "logps/chosen": -317.7173156738281, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -317.0647888183594, "logps/ref_rejected": -297.7698669433594, "logps/rejected": -358.817626953125, "loss": 0.6362, "rewards/accuracies": 1.0, "rewards/chosen": -0.06524688005447388, "rewards/grad_term": 0.008810807019472122, "rewards/margins": 6.039529800415039, "rewards/rejected": -6.1047773361206055, "step": 134 }, { "epoch": 0.28004667444574094, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 58.595789612615945, "learning_rate": 9.573241061130333e-07, "logits/chosen": 0.36474430561065674, "logits/rejected": 0.35197287797927856, "logps/accuracies": 0.75, "logps/chosen": -326.8201904296875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -325.84783935546875, "logps/ref_rejected": -326.2251892089844, "logps/rejected": -372.5774841308594, "loss": 0.6008, "rewards/accuracies": 0.875, "rewards/chosen": -0.09723645448684692, "rewards/grad_term": 0.020627174526453018, "rewards/margins": 4.537996768951416, "rewards/rejected": -4.6352338790893555, "step": 135 }, { "epoch": 0.28212109425645016, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 25.14456310024547, "learning_rate": 9.561707035755479e-07, "logits/chosen": 0.26421457529067993, "logits/rejected": 0.33900099992752075, "logps/accuracies": 0.875, "logps/chosen": -260.8269348144531, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -258.1986999511719, "logps/ref_rejected": -283.28680419921875, "logps/rejected": -322.0076904296875, "loss": 0.6207, "rewards/accuracies": 0.75, "rewards/chosen": -0.26282617449760437, "rewards/grad_term": 0.02707597427070141, "rewards/margins": 3.609261989593506, "rewards/rejected": -3.8720884323120117, "step": 136 }, { "epoch": 0.2841955140671593, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.4375, "grad_norm": 42.8867860973982, "learning_rate": 9.550173010380622e-07, "logits/chosen": 0.09971302002668381, "logits/rejected": 0.12542912364006042, "logps/accuracies": 0.5625, "logps/chosen": -322.331787109375, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -321.5882873535156, "logps/ref_rejected": -315.2976379394531, "logps/rejected": -352.12066650390625, "loss": 0.6771, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07434892654418945, "rewards/grad_term": 0.01898660883307457, "rewards/margins": 3.607954978942871, "rewards/rejected": -3.6823039054870605, "step": 137 }, { "epoch": 0.28626993387786853, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 67.6595700226082, "learning_rate": 9.538638985005768e-07, "logits/chosen": 0.23438116908073425, "logits/rejected": 0.3342619240283966, "logps/accuracies": 0.8125, "logps/chosen": -300.3291015625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -296.4096984863281, "logps/ref_rejected": -310.85064697265625, "logps/rejected": -363.2022399902344, "loss": 0.6381, "rewards/accuracies": 0.875, "rewards/chosen": -0.39193806052207947, "rewards/grad_term": 0.017197635024785995, "rewards/margins": 4.84321928024292, "rewards/rejected": -5.235157489776611, "step": 138 }, { "epoch": 0.28834435368857775, "flips/correct->correct": 0.1875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.375, "grad_norm": 46.44298957827346, "learning_rate": 9.52710495963091e-07, "logits/chosen": 0.1467391550540924, "logits/rejected": 0.11830101907253265, "logps/accuracies": 0.625, "logps/chosen": -321.1262512207031, "logps/ref_accuracies": 0.1875, "logps/ref_chosen": -325.08428955078125, "logps/ref_rejected": -281.5567321777344, "logps/rejected": -340.51849365234375, "loss": 0.6354, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3958081305027008, "rewards/grad_term": 0.013231747783720493, "rewards/margins": 6.291983604431152, "rewards/rejected": -5.896175384521484, "step": 139 }, { "epoch": 0.2904187734992869, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 36.89267412980749, "learning_rate": 9.515570934256055e-07, "logits/chosen": 0.3458084762096405, "logits/rejected": 0.37101224064826965, "logps/accuracies": 0.875, "logps/chosen": -278.9690246582031, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -275.2301940917969, "logps/ref_rejected": -293.14935302734375, "logps/rejected": -343.7322998046875, "loss": 0.6516, "rewards/accuracies": 0.8125, "rewards/chosen": -0.37388384342193604, "rewards/grad_term": 0.020328430458903313, "rewards/margins": 4.684409141540527, "rewards/rejected": -5.058292865753174, "step": 140 }, { "epoch": 0.2924931933099961, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0625, "flips/incorrect->incorrect": 0.3125, "grad_norm": 29.520183041657447, "learning_rate": 9.504036908881198e-07, "logits/chosen": 0.2588088810443878, "logits/rejected": 0.35400643944740295, "logps/accuracies": 0.6875, "logps/chosen": -329.9474182128906, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -309.6986999511719, "logps/ref_rejected": -333.968994140625, "logps/rejected": -376.42169189453125, "loss": 0.6512, "rewards/accuracies": 0.6875, "rewards/chosen": -2.024869918823242, "rewards/grad_term": 0.03422696888446808, "rewards/margins": 2.220407247543335, "rewards/rejected": -4.24527645111084, "step": 141 }, { "epoch": 0.2945676131207053, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 20.090341392606742, "learning_rate": 9.492502883506344e-07, "logits/chosen": 0.17465892434120178, "logits/rejected": 0.19804833829402924, "logps/accuracies": 0.6875, "logps/chosen": -313.5783386230469, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -307.25164794921875, "logps/ref_rejected": -290.28948974609375, "logps/rejected": -353.5581359863281, "loss": 0.5813, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6326678991317749, "rewards/grad_term": 0.017216186970472336, "rewards/margins": 5.694197177886963, "rewards/rejected": -6.3268656730651855, "step": 142 }, { "epoch": 0.2966420329314145, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 38.2122061812, "learning_rate": 9.480968858131488e-07, "logits/chosen": 0.28784969449043274, "logits/rejected": 0.38434553146362305, "logps/accuracies": 0.8125, "logps/chosen": -337.89697265625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -307.4181823730469, "logps/ref_rejected": -347.0847473144531, "logps/rejected": -420.6956787109375, "loss": 0.5784, "rewards/accuracies": 0.875, "rewards/chosen": -3.047877788543701, "rewards/grad_term": 0.021118801087141037, "rewards/margins": 4.3132147789001465, "rewards/rejected": -7.361092567443848, "step": 143 }, { "epoch": 0.29871645274212366, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0, "grad_norm": 37.993499093491, "learning_rate": 9.469434832756632e-07, "logits/chosen": 0.3818073570728302, "logits/rejected": 0.4472813010215759, "logps/accuracies": 1.0, "logps/chosen": -309.6822509765625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -295.15576171875, "logps/ref_rejected": -316.4786071777344, "logps/rejected": -395.452392578125, "loss": 0.6367, "rewards/accuracies": 1.0, "rewards/chosen": -1.4526524543762207, "rewards/grad_term": 0.005610838998109102, "rewards/margins": 6.4447221755981445, "rewards/rejected": -7.897374629974365, "step": 144 }, { "epoch": 0.3007908725528329, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 54.21342872299693, "learning_rate": 9.457900807381776e-07, "logits/chosen": 0.10580252856016159, "logits/rejected": 0.1295485496520996, "logps/accuracies": 0.875, "logps/chosen": -312.27178955078125, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -306.3640441894531, "logps/ref_rejected": -344.1884765625, "logps/rejected": -413.30572509765625, "loss": 0.5971, "rewards/accuracies": 0.875, "rewards/chosen": -0.5907725095748901, "rewards/grad_term": 0.014945675618946552, "rewards/margins": 6.320951461791992, "rewards/rejected": -6.911723613739014, "step": 145 }, { "epoch": 0.3028652923635421, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 49.62895603709547, "learning_rate": 9.446366782006921e-07, "logits/chosen": 0.496852844953537, "logits/rejected": 0.49739354848861694, "logps/accuracies": 0.75, "logps/chosen": -254.4556121826172, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -253.5081787109375, "logps/ref_rejected": -243.6640625, "logps/rejected": -306.6009826660156, "loss": 0.6307, "rewards/accuracies": 0.875, "rewards/chosen": -0.09474316239356995, "rewards/grad_term": 0.014840014278888702, "rewards/margins": 6.198947906494141, "rewards/rejected": -6.2936906814575195, "step": 146 }, { "epoch": 0.30493971217425125, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0625, "grad_norm": 33.16438726233068, "learning_rate": 9.434832756632064e-07, "logits/chosen": 0.3227022588253021, "logits/rejected": 0.29622456431388855, "logps/accuracies": 0.9375, "logps/chosen": -315.11383056640625, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -323.7686462402344, "logps/ref_rejected": -318.28631591796875, "logps/rejected": -397.4606628417969, "loss": 0.598, "rewards/accuracies": 1.0, "rewards/chosen": 0.865482747554779, "rewards/grad_term": 0.0047454568557441235, "rewards/margins": 8.782920837402344, "rewards/rejected": -7.917438507080078, "step": 147 }, { "epoch": 0.30701413198496047, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 32.98244501476046, "learning_rate": 9.423298731257209e-07, "logits/chosen": 0.26352736353874207, "logits/rejected": 0.2875834107398987, "logps/accuracies": 0.6875, "logps/chosen": -268.8025817871094, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -260.4774169921875, "logps/ref_rejected": -263.6733703613281, "logps/rejected": -314.69268798828125, "loss": 0.6464, "rewards/accuracies": 0.75, "rewards/chosen": -0.8325148820877075, "rewards/grad_term": 0.02254444733262062, "rewards/margins": 4.269417762756348, "rewards/rejected": -5.101933002471924, "step": 148 }, { "epoch": 0.30908855179566963, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 33.32776229173226, "learning_rate": 9.411764705882352e-07, "logits/chosen": 0.11156149208545685, "logits/rejected": 0.24737051129341125, "logps/accuracies": 0.75, "logps/chosen": -303.48388671875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -298.459228515625, "logps/ref_rejected": -314.2269592285156, "logps/rejected": -370.69622802734375, "loss": 0.5583, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5024658441543579, "rewards/grad_term": 0.022268792614340782, "rewards/margins": 5.144461631774902, "rewards/rejected": -5.646927356719971, "step": 149 }, { "epoch": 0.31116297160637885, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0625, "flips/incorrect->incorrect": 0.375, "grad_norm": 21.82859690680754, "learning_rate": 9.400230680507497e-07, "logits/chosen": 0.29909923672676086, "logits/rejected": 0.33298757672309875, "logps/accuracies": 0.625, "logps/chosen": -245.38111877441406, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -240.87034606933594, "logps/ref_rejected": -240.8376922607422, "logps/rejected": -289.25067138671875, "loss": 0.5837, "rewards/accuracies": 0.8125, "rewards/chosen": -0.45107802748680115, "rewards/grad_term": 0.022424593567848206, "rewards/margins": 4.390218257904053, "rewards/rejected": -4.841296195983887, "step": 150 }, { "epoch": 0.313237391417088, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 32.35074274110825, "learning_rate": 9.38869665513264e-07, "logits/chosen": 0.1612250953912735, "logits/rejected": 0.15677325427532196, "logps/accuracies": 0.8125, "logps/chosen": -291.77081298828125, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -292.1325988769531, "logps/ref_rejected": -287.9341735839844, "logps/rejected": -358.65057373046875, "loss": 0.6135, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03617708384990692, "rewards/grad_term": 0.011941466480493546, "rewards/margins": 7.107817649841309, "rewards/rejected": -7.071640968322754, "step": 151 }, { "epoch": 0.3153118112277972, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0625, "grad_norm": 42.437628926037625, "learning_rate": 9.377162629757785e-07, "logits/chosen": 0.21432383358478546, "logits/rejected": 0.2382117211818695, "logps/accuracies": 0.9375, "logps/chosen": -270.55712890625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -275.3125305175781, "logps/ref_rejected": -276.75384521484375, "logps/rejected": -355.29779052734375, "loss": 0.5223, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4755399823188782, "rewards/grad_term": 0.004575583152472973, "rewards/margins": 8.329938888549805, "rewards/rejected": -7.854398250579834, "step": 152 }, { "epoch": 0.31738623103850644, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.125, "grad_norm": 41.66344734527698, "learning_rate": 9.365628604382929e-07, "logits/chosen": -0.07189223915338516, "logits/rejected": -0.08959042280912399, "logps/accuracies": 0.875, "logps/chosen": -328.5618896484375, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -327.5036926269531, "logps/ref_rejected": -321.0348205566406, "logps/rejected": -393.354248046875, "loss": 0.5755, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10582125186920166, "rewards/grad_term": 0.014492910355329514, "rewards/margins": 7.126119613647461, "rewards/rejected": -7.231941223144531, "step": 153 }, { "epoch": 0.3194606508492156, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 29.74794689137638, "learning_rate": 9.354094579008073e-07, "logits/chosen": 0.35974666476249695, "logits/rejected": 0.3688337206840515, "logps/accuracies": 0.8125, "logps/chosen": -313.2078857421875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -305.1917419433594, "logps/ref_rejected": -320.38128662109375, "logps/rejected": -390.50933837890625, "loss": 0.5928, "rewards/accuracies": 1.0, "rewards/chosen": -0.8016154766082764, "rewards/grad_term": 0.008705828338861465, "rewards/margins": 6.2111945152282715, "rewards/rejected": -7.012809753417969, "step": 154 }, { "epoch": 0.3215350706599248, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 28.687378270489237, "learning_rate": 9.342560553633218e-07, "logits/chosen": 0.18269102275371552, "logits/rejected": 0.1776474416255951, "logps/accuracies": 0.875, "logps/chosen": -275.4834289550781, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -272.5428771972656, "logps/ref_rejected": -261.2928161621094, "logps/rejected": -336.526611328125, "loss": 0.5897, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2940564453601837, "rewards/grad_term": 0.006705356761813164, "rewards/margins": 7.2293267250061035, "rewards/rejected": -7.523382186889648, "step": 155 }, { "epoch": 0.323609490470634, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 36.335486366302646, "learning_rate": 9.331026528258363e-07, "logits/chosen": 0.07444247603416443, "logits/rejected": 0.20133280754089355, "logps/accuracies": 0.75, "logps/chosen": -329.2906188964844, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -316.47515869140625, "logps/ref_rejected": -332.9421081542969, "logps/rejected": -394.68560791015625, "loss": 0.5576, "rewards/accuracies": 0.875, "rewards/chosen": -1.2815489768981934, "rewards/grad_term": 0.0174331646412611, "rewards/margins": 4.892797946929932, "rewards/rejected": -6.174346923828125, "step": 156 }, { "epoch": 0.3256839102813432, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 27.29416495658281, "learning_rate": 9.319492502883506e-07, "logits/chosen": 0.4256312847137451, "logits/rejected": 0.4740726053714752, "logps/accuracies": 0.9375, "logps/chosen": -321.6173095703125, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -303.39617919921875, "logps/ref_rejected": -309.72113037109375, "logps/rejected": -375.21429443359375, "loss": 0.5708, "rewards/accuracies": 0.9375, "rewards/chosen": -1.822115182876587, "rewards/grad_term": 0.014289310202002525, "rewards/margins": 4.727199077606201, "rewards/rejected": -6.549314022064209, "step": 157 }, { "epoch": 0.32775833009205235, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 32.223889847251904, "learning_rate": 9.307958477508651e-07, "logits/chosen": 0.3213425576686859, "logits/rejected": 0.35512280464172363, "logps/accuracies": 0.8125, "logps/chosen": -299.30364990234375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -305.61181640625, "logps/ref_rejected": -306.4347839355469, "logps/rejected": -362.4326171875, "loss": 0.573, "rewards/accuracies": 1.0, "rewards/chosen": 0.6308162212371826, "rewards/grad_term": 0.008797680027782917, "rewards/margins": 6.230600357055664, "rewards/rejected": -5.5997843742370605, "step": 158 }, { "epoch": 0.32983274990276157, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0, "grad_norm": 42.213863271098234, "learning_rate": 9.296424452133794e-07, "logits/chosen": 0.377382755279541, "logits/rejected": 0.456988126039505, "logps/accuracies": 1.0, "logps/chosen": -288.3867492675781, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -293.0536804199219, "logps/ref_rejected": -317.74163818359375, "logps/rejected": -391.38153076171875, "loss": 0.546, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4666934609413147, "rewards/grad_term": 0.0034745843149721622, "rewards/margins": 7.8306803703308105, "rewards/rejected": -7.363986968994141, "step": 159 }, { "epoch": 0.3319071697134708, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 71.4121450161779, "learning_rate": 9.284890426758939e-07, "logits/chosen": 0.31267380714416504, "logits/rejected": 0.33250027894973755, "logps/accuracies": 0.75, "logps/chosen": -327.6630554199219, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -322.4392395019531, "logps/ref_rejected": -329.0008544921875, "logps/rejected": -386.5987548828125, "loss": 0.6503, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5223828554153442, "rewards/grad_term": 0.015317104756832123, "rewards/margins": 5.23740291595459, "rewards/rejected": -5.759785175323486, "step": 160 }, { "epoch": 0.33398158952417994, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 90.86602072589874, "learning_rate": 9.273356401384083e-07, "logits/chosen": 0.16347447037696838, "logits/rejected": 0.2435542643070221, "logps/accuracies": 0.8125, "logps/chosen": -309.9219665527344, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -309.3226013183594, "logps/ref_rejected": -317.44964599609375, "logps/rejected": -393.472900390625, "loss": 0.5695, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05993741750717163, "rewards/grad_term": 0.009122053161263466, "rewards/margins": 7.542388916015625, "rewards/rejected": -7.602326393127441, "step": 161 }, { "epoch": 0.33605600933488916, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 63.38287747184822, "learning_rate": 9.261822376009227e-07, "logits/chosen": 0.03160097077488899, "logits/rejected": 0.15727761387825012, "logps/accuracies": 0.75, "logps/chosen": -304.5788269042969, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -295.0198669433594, "logps/ref_rejected": -315.7266540527344, "logps/rejected": -362.4599304199219, "loss": 0.669, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9558972716331482, "rewards/grad_term": 0.02149307169020176, "rewards/margins": 3.7174317836761475, "rewards/rejected": -4.673328876495361, "step": 162 }, { "epoch": 0.3381304291455983, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 37.24982276954982, "learning_rate": 9.250288350634371e-07, "logits/chosen": 0.24977634847164154, "logits/rejected": 0.2465619146823883, "logps/accuracies": 0.875, "logps/chosen": -300.94000244140625, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -296.2314453125, "logps/ref_rejected": -290.3169250488281, "logps/rejected": -364.783447265625, "loss": 0.5658, "rewards/accuracies": 0.9375, "rewards/chosen": -0.47085797786712646, "rewards/grad_term": 0.00792708620429039, "rewards/margins": 6.97579288482666, "rewards/rejected": -7.446650981903076, "step": 163 }, { "epoch": 0.34020484895630754, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 41.17406855028247, "learning_rate": 9.238754325259515e-07, "logits/chosen": 0.27946552634239197, "logits/rejected": 0.2818312346935272, "logps/accuracies": 0.75, "logps/chosen": -334.7640075683594, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -326.3661804199219, "logps/ref_rejected": -322.5880126953125, "logps/rejected": -379.8989562988281, "loss": 0.5503, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8397865295410156, "rewards/grad_term": 0.01920832134783268, "rewards/margins": 4.89130973815918, "rewards/rejected": -5.731095790863037, "step": 164 }, { "epoch": 0.3422792687670167, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.25, "grad_norm": 30.140573016986096, "learning_rate": 9.227220299884659e-07, "logits/chosen": 0.106672503054142, "logits/rejected": 0.20751769840717316, "logps/accuracies": 0.75, "logps/chosen": -289.3790283203125, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -279.317138671875, "logps/ref_rejected": -285.1666564941406, "logps/rejected": -346.164794921875, "loss": 0.5963, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0061873197555542, "rewards/grad_term": 0.0124615877866745, "rewards/margins": 5.093625068664551, "rewards/rejected": -6.099812030792236, "step": 165 }, { "epoch": 0.3443536885777259, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 33.13491495817512, "learning_rate": 9.215686274509803e-07, "logits/chosen": 0.49837812781333923, "logits/rejected": 0.5220686793327332, "logps/accuracies": 0.8125, "logps/chosen": -289.36871337890625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -290.96624755859375, "logps/ref_rejected": -274.1662292480469, "logps/rejected": -336.1552734375, "loss": 0.5496, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15975357592105865, "rewards/grad_term": 0.012641198933124542, "rewards/margins": 6.358658790588379, "rewards/rejected": -6.1989054679870605, "step": 166 }, { "epoch": 0.34642810838843513, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.1875, "grad_norm": 54.11732274527683, "learning_rate": 9.204152249134947e-07, "logits/chosen": 0.015494227409362793, "logits/rejected": 0.016678210347890854, "logps/accuracies": 0.8125, "logps/chosen": -325.68914794921875, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -331.82000732421875, "logps/ref_rejected": -319.7809143066406, "logps/rejected": -382.5654296875, "loss": 0.5618, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6130860447883606, "rewards/grad_term": 0.009663441218435764, "rewards/margins": 6.891541004180908, "rewards/rejected": -6.278454780578613, "step": 167 }, { "epoch": 0.3485025281991443, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 35.17938428087587, "learning_rate": 9.192618223760092e-07, "logits/chosen": 0.1498590111732483, "logits/rejected": 0.03451567143201828, "logps/accuracies": 0.75, "logps/chosen": -318.3073425292969, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -324.92156982421875, "logps/ref_rejected": -279.1431884765625, "logps/rejected": -343.2935791015625, "loss": 0.5567, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6614212989807129, "rewards/grad_term": 0.004435483831912279, "rewards/margins": 7.076463222503662, "rewards/rejected": -6.415041923522949, "step": 168 }, { "epoch": 0.3505769480098535, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.4375, "grad_norm": 29.79744425397875, "learning_rate": 9.181084198385236e-07, "logits/chosen": 0.4352983832359314, "logits/rejected": 0.42166027426719666, "logps/accuracies": 0.5625, "logps/chosen": -218.41848754882812, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -218.35841369628906, "logps/ref_rejected": -203.54812622070312, "logps/rejected": -244.30548095703125, "loss": 0.6235, "rewards/accuracies": 0.75, "rewards/chosen": -0.0060057491064071655, "rewards/grad_term": 0.020719772204756737, "rewards/margins": 4.069727897644043, "rewards/rejected": -4.0757341384887695, "step": 169 }, { "epoch": 0.35265136782056267, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.4375, "grad_norm": 37.443318352494714, "learning_rate": 9.16955017301038e-07, "logits/chosen": 0.34023189544677734, "logits/rejected": 0.36414065957069397, "logps/accuracies": 0.5625, "logps/chosen": -341.10162353515625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -348.6609191894531, "logps/ref_rejected": -313.69305419921875, "logps/rejected": -365.22552490234375, "loss": 0.5449, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7559297680854797, "rewards/grad_term": 0.012427425011992455, "rewards/margins": 5.909174919128418, "rewards/rejected": -5.153245449066162, "step": 170 }, { "epoch": 0.3547257876312719, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 38.974124690109655, "learning_rate": 9.158016147635525e-07, "logits/chosen": 0.2296074777841568, "logits/rejected": 0.22281108796596527, "logps/accuracies": 0.75, "logps/chosen": -266.399658203125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -266.3810729980469, "logps/ref_rejected": -264.22479248046875, "logps/rejected": -312.05328369140625, "loss": 0.5639, "rewards/accuracies": 0.875, "rewards/chosen": -0.00185690401121974, "rewards/grad_term": 0.021189574152231216, "rewards/margins": 4.780992031097412, "rewards/rejected": -4.782848358154297, "step": 171 }, { "epoch": 0.35680020744198104, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.125, "grad_norm": 40.615225671584604, "learning_rate": 9.146482122260668e-07, "logits/chosen": -0.058992840349674225, "logits/rejected": 0.13406533002853394, "logps/accuracies": 0.8125, "logps/chosen": -248.48712158203125, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -245.46426391601562, "logps/ref_rejected": -322.3562316894531, "logps/rejected": -359.47808837890625, "loss": 0.6297, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3022858798503876, "rewards/grad_term": 0.02431631274521351, "rewards/margins": 3.4098992347717285, "rewards/rejected": -3.7121849060058594, "step": 172 }, { "epoch": 0.35887462725269026, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.25, "grad_norm": 48.38748944944743, "learning_rate": 9.134948096885813e-07, "logits/chosen": 0.17907698452472687, "logits/rejected": 0.2532532811164856, "logps/accuracies": 0.75, "logps/chosen": -268.450927734375, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -272.5251770019531, "logps/ref_rejected": -259.0370178222656, "logps/rejected": -306.13800048828125, "loss": 0.6268, "rewards/accuracies": 0.9375, "rewards/chosen": 0.40742844343185425, "rewards/grad_term": 0.01490036677569151, "rewards/margins": 5.117522716522217, "rewards/rejected": -4.710094451904297, "step": 173 }, { "epoch": 0.3609490470633995, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 31.921038664510473, "learning_rate": 9.123414071510956e-07, "logits/chosen": 0.24931451678276062, "logits/rejected": 0.32089927792549133, "logps/accuracies": 0.875, "logps/chosen": -327.949951171875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -325.4226379394531, "logps/ref_rejected": -319.7037353515625, "logps/rejected": -401.667724609375, "loss": 0.5455, "rewards/accuracies": 0.9375, "rewards/chosen": -0.25273361802101135, "rewards/grad_term": 0.007721267640590668, "rewards/margins": 7.943665504455566, "rewards/rejected": -8.196398735046387, "step": 174 }, { "epoch": 0.36302346687410864, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 32.597119731399815, "learning_rate": 9.111880046136101e-07, "logits/chosen": 0.14227242767810822, "logits/rejected": 0.14696185290813446, "logps/accuracies": 0.9375, "logps/chosen": -302.6449890136719, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -304.1968078613281, "logps/ref_rejected": -312.4093322753906, "logps/rejected": -365.310791015625, "loss": 0.5941, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15517938137054443, "rewards/grad_term": 0.017512062564492226, "rewards/margins": 5.4453277587890625, "rewards/rejected": -5.2901482582092285, "step": 175 }, { "epoch": 0.36509788668481785, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 40.38991325463123, "learning_rate": 9.100346020761245e-07, "logits/chosen": 0.4459385275840759, "logits/rejected": 0.48317578434944153, "logps/accuracies": 0.9375, "logps/chosen": -375.50152587890625, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -361.5687255859375, "logps/ref_rejected": -390.73028564453125, "logps/rejected": -472.79315185546875, "loss": 0.6096, "rewards/accuracies": 0.875, "rewards/chosen": -1.3932788372039795, "rewards/grad_term": 0.012904556468129158, "rewards/margins": 6.81300163269043, "rewards/rejected": -8.206280708312988, "step": 176 }, { "epoch": 0.367172306495527, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 40.714548146552936, "learning_rate": 9.088811995386389e-07, "logits/chosen": 0.1182754784822464, "logits/rejected": 0.10860362648963928, "logps/accuracies": 0.8125, "logps/chosen": -277.8997802734375, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -265.0371398925781, "logps/ref_rejected": -266.34906005859375, "logps/rejected": -348.8620300292969, "loss": 0.5457, "rewards/accuracies": 0.9375, "rewards/chosen": -1.286261796951294, "rewards/grad_term": 0.013155965134501457, "rewards/margins": 6.965037822723389, "rewards/rejected": -8.251298904418945, "step": 177 }, { "epoch": 0.36924672630623623, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.3125, "grad_norm": 23.925847924727726, "learning_rate": 9.077277970011533e-07, "logits/chosen": 0.19493117928504944, "logits/rejected": 0.1747354418039322, "logps/accuracies": 0.6875, "logps/chosen": -253.36224365234375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -253.14541625976562, "logps/ref_rejected": -253.6729736328125, "logps/rejected": -327.2242736816406, "loss": 0.5883, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02168332040309906, "rewards/grad_term": 0.010116681456565857, "rewards/margins": 7.33344841003418, "rewards/rejected": -7.35513162612915, "step": 178 }, { "epoch": 0.37132114611694544, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 41.54057148552365, "learning_rate": 9.065743944636677e-07, "logits/chosen": 0.09981651604175568, "logits/rejected": 0.060021985322237015, "logps/accuracies": 0.8125, "logps/chosen": -319.4670104980469, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -305.79840087890625, "logps/ref_rejected": -288.304443359375, "logps/rejected": -342.7301940917969, "loss": 0.6653, "rewards/accuracies": 0.875, "rewards/chosen": -1.3668627738952637, "rewards/grad_term": 0.01719023287296295, "rewards/margins": 4.075715065002441, "rewards/rejected": -5.442577838897705, "step": 179 }, { "epoch": 0.3733955659276546, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 53.53086030452953, "learning_rate": 9.054209919261822e-07, "logits/chosen": 0.32655516266822815, "logits/rejected": 0.4233202338218689, "logps/accuracies": 0.75, "logps/chosen": -231.95205688476562, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -222.77748107910156, "logps/ref_rejected": -249.9750213623047, "logps/rejected": -298.7811584472656, "loss": 0.616, "rewards/accuracies": 0.625, "rewards/chosen": -0.9174575805664062, "rewards/grad_term": 0.03101710043847561, "rewards/margins": 3.9631576538085938, "rewards/rejected": -4.880615234375, "step": 180 }, { "epoch": 0.3754699857383638, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 32.474177233879054, "learning_rate": 9.042675893886967e-07, "logits/chosen": 0.15671122074127197, "logits/rejected": 0.15824642777442932, "logps/accuracies": 0.875, "logps/chosen": -331.437255859375, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -329.1837158203125, "logps/ref_rejected": -332.14324951171875, "logps/rejected": -419.30963134765625, "loss": 0.585, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2253548800945282, "rewards/grad_term": 0.006346164736896753, "rewards/margins": 8.491281509399414, "rewards/rejected": -8.716635704040527, "step": 181 }, { "epoch": 0.377544405549073, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 32.49282958416918, "learning_rate": 9.03114186851211e-07, "logits/chosen": 0.18260034918785095, "logits/rejected": 0.14178498089313507, "logps/accuracies": 0.8125, "logps/chosen": -295.148193359375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -288.6500244140625, "logps/ref_rejected": -278.32867431640625, "logps/rejected": -359.5013427734375, "loss": 0.5916, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6498188972473145, "rewards/grad_term": 0.008202875964343548, "rewards/margins": 7.467443466186523, "rewards/rejected": -8.11726188659668, "step": 182 }, { "epoch": 0.3796188253597822, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.25, "grad_norm": 29.4740035475264, "learning_rate": 9.019607843137255e-07, "logits/chosen": 0.26660820841789246, "logits/rejected": 0.36798760294914246, "logps/accuracies": 0.75, "logps/chosen": -304.508056640625, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -300.42181396484375, "logps/ref_rejected": -271.1709899902344, "logps/rejected": -343.62274169921875, "loss": 0.5723, "rewards/accuracies": 0.875, "rewards/chosen": -0.40862417221069336, "rewards/grad_term": 0.009963629767298698, "rewards/margins": 6.836550712585449, "rewards/rejected": -7.245175361633301, "step": 183 }, { "epoch": 0.38169324517049136, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 39.92370508801508, "learning_rate": 9.008073817762398e-07, "logits/chosen": 0.11014918982982635, "logits/rejected": 0.12970136106014252, "logps/accuracies": 0.875, "logps/chosen": -311.4981689453125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -316.7832336425781, "logps/ref_rejected": -304.4905700683594, "logps/rejected": -379.767333984375, "loss": 0.5724, "rewards/accuracies": 1.0, "rewards/chosen": 0.5285077095031738, "rewards/grad_term": 0.0013775170082226396, "rewards/margins": 8.056180953979492, "rewards/rejected": -7.527673721313477, "step": 184 }, { "epoch": 0.3837676649812006, "flips/correct->correct": 0.1875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.375, "grad_norm": 77.26727734044364, "learning_rate": 8.996539792387543e-07, "logits/chosen": 0.32915353775024414, "logits/rejected": 0.36334753036499023, "logps/accuracies": 0.625, "logps/chosen": -317.26904296875, "logps/ref_accuracies": 0.1875, "logps/ref_chosen": -323.752685546875, "logps/ref_rejected": -283.16058349609375, "logps/rejected": -342.1800537109375, "loss": 0.5702, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6483669281005859, "rewards/grad_term": 0.011216258630156517, "rewards/margins": 6.550315856933594, "rewards/rejected": -5.901949405670166, "step": 185 }, { "epoch": 0.3858420847919098, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.375, "grad_norm": 44.46292399532884, "learning_rate": 8.985005767012687e-07, "logits/chosen": 0.250750333070755, "logits/rejected": 0.2685026228427887, "logps/accuracies": 0.625, "logps/chosen": -308.99078369140625, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -318.26324462890625, "logps/ref_rejected": -362.1750793457031, "logps/rejected": -410.8619384765625, "loss": 0.6081, "rewards/accuracies": 1.0, "rewards/chosen": 0.9272449612617493, "rewards/grad_term": 0.01049741543829441, "rewards/margins": 5.795932769775391, "rewards/rejected": -4.868687152862549, "step": 186 }, { "epoch": 0.38791650460261895, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 60.58056206089815, "learning_rate": 8.973471741637831e-07, "logits/chosen": 0.16933155059814453, "logits/rejected": 0.23612166941165924, "logps/accuracies": 0.8125, "logps/chosen": -347.870849609375, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -355.09906005859375, "logps/ref_rejected": -377.2695007324219, "logps/rejected": -426.260009765625, "loss": 0.6342, "rewards/accuracies": 0.875, "rewards/chosen": 0.7228207588195801, "rewards/grad_term": 0.016279596835374832, "rewards/margins": 5.6218695640563965, "rewards/rejected": -4.899049282073975, "step": 187 }, { "epoch": 0.38999092441332817, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 38.184653723145104, "learning_rate": 8.961937716262975e-07, "logits/chosen": 0.19383230805397034, "logits/rejected": 0.27799373865127563, "logps/accuracies": 0.875, "logps/chosen": -237.73406982421875, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -243.88479614257812, "logps/ref_rejected": -270.0097961425781, "logps/rejected": -308.8341064453125, "loss": 0.6516, "rewards/accuracies": 0.875, "rewards/chosen": 0.6150741577148438, "rewards/grad_term": 0.016990307718515396, "rewards/margins": 4.497503280639648, "rewards/rejected": -3.8824288845062256, "step": 188 }, { "epoch": 0.3920653442240373, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.25, "grad_norm": 94.03754868556166, "learning_rate": 8.95040369088812e-07, "logits/chosen": 0.18277229368686676, "logits/rejected": 0.32301703095436096, "logps/accuracies": 0.75, "logps/chosen": -297.812744140625, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -297.7760314941406, "logps/ref_rejected": -351.8489990234375, "logps/rejected": -395.95452880859375, "loss": 0.5906, "rewards/accuracies": 0.9375, "rewards/chosen": -0.003670990467071533, "rewards/grad_term": 0.01534755527973175, "rewards/margins": 4.406883716583252, "rewards/rejected": -4.4105544090271, "step": 189 }, { "epoch": 0.39413976403474654, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 36.637550653396346, "learning_rate": 8.938869665513263e-07, "logits/chosen": 0.3476504683494568, "logits/rejected": 0.33958810567855835, "logps/accuracies": 0.8125, "logps/chosen": -227.98374938964844, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -220.88394165039062, "logps/ref_rejected": -224.73675537109375, "logps/rejected": -278.4228515625, "loss": 0.6248, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7099814414978027, "rewards/grad_term": 0.017782405018806458, "rewards/margins": 4.658628463745117, "rewards/rejected": -5.36860990524292, "step": 190 }, { "epoch": 0.3962141838454557, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 34.6180474435158, "learning_rate": 8.927335640138408e-07, "logits/chosen": 0.29524171352386475, "logits/rejected": 0.249376118183136, "logps/accuracies": 0.75, "logps/chosen": -340.69873046875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -323.4943542480469, "logps/ref_rejected": -289.3078308105469, "logps/rejected": -365.9657287597656, "loss": 0.5815, "rewards/accuracies": 0.875, "rewards/chosen": -1.7204382419586182, "rewards/grad_term": 0.016098525375127792, "rewards/margins": 5.945353031158447, "rewards/rejected": -7.6657915115356445, "step": 191 }, { "epoch": 0.3982886036561649, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 38.65115109290824, "learning_rate": 8.915801614763551e-07, "logits/chosen": 0.10618914663791656, "logits/rejected": 0.20010565221309662, "logps/accuracies": 0.8125, "logps/chosen": -255.34237670898438, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -254.41799926757812, "logps/ref_rejected": -255.06790161132812, "logps/rejected": -335.4033203125, "loss": 0.5252, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09243983030319214, "rewards/grad_term": 0.008064459078013897, "rewards/margins": 7.941101551055908, "rewards/rejected": -8.033540725708008, "step": 192 }, { "epoch": 0.3982886036561649, "eval_flips/correct->correct": 0.4334975481033325, "eval_flips/correct->incorrect": 0.009852216579020023, "eval_flips/incorrect->correct": 0.35960590839385986, "eval_flips/incorrect->incorrect": 0.19704432785511017, "eval_logits/chosen": 0.20541736483573914, "eval_logits/rejected": 0.25030994415283203, "eval_logps/accuracies": 0.7931034564971924, "eval_logps/chosen": -291.723388671875, "eval_logps/ref_accuracies": 0.4433497488498688, "eval_logps/ref_chosen": -287.3511047363281, "eval_logps/ref_rejected": -289.0460205078125, "eval_logps/rejected": -350.8753356933594, "eval_loss": 0.6111010313034058, "eval_rewards/accuracies": 0.8620689511299133, "eval_rewards/chosen": -0.4372285008430481, "eval_rewards/grad_term": 0.016007939353585243, "eval_rewards/margins": 5.745702743530273, "eval_rewards/rejected": -6.182931900024414, "eval_runtime": 791.2188, "eval_samples_per_second": 2.045, "eval_steps_per_second": 0.257, "step": 192 }, { "epoch": 0.40036302346687414, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 29.785884187336258, "learning_rate": 8.904267589388697e-07, "logits/chosen": 0.4344290494918823, "logits/rejected": 0.4853968620300293, "logps/accuracies": 0.75, "logps/chosen": -235.33804321289062, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -224.7119903564453, "logps/ref_rejected": -247.52615356445312, "logps/rejected": -317.22064208984375, "loss": 0.5751, "rewards/accuracies": 0.875, "rewards/chosen": -1.0626037120819092, "rewards/grad_term": 0.017159339040517807, "rewards/margins": 5.906847953796387, "rewards/rejected": -6.969451904296875, "step": 193 }, { "epoch": 0.4024374432775833, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 33.823622203078585, "learning_rate": 8.89273356401384e-07, "logits/chosen": -0.00733010470867157, "logits/rejected": 0.039806053042411804, "logps/accuracies": 0.8125, "logps/chosen": -298.87469482421875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -278.5909423828125, "logps/ref_rejected": -274.5445251464844, "logps/rejected": -354.7998352050781, "loss": 0.6322, "rewards/accuracies": 0.9375, "rewards/chosen": -2.028369426727295, "rewards/grad_term": 0.014807065948843956, "rewards/margins": 5.997160911560059, "rewards/rejected": -8.025529861450195, "step": 194 }, { "epoch": 0.4045118630882925, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 71.82502886035329, "learning_rate": 8.881199538638985e-07, "logits/chosen": 0.06032078340649605, "logits/rejected": 0.08065234869718552, "logps/accuracies": 0.8125, "logps/chosen": -305.660888671875, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -292.9607238769531, "logps/ref_rejected": -284.1155090332031, "logps/rejected": -348.9861755371094, "loss": 0.6122, "rewards/accuracies": 0.75, "rewards/chosen": -1.2700166702270508, "rewards/grad_term": 0.022815629839897156, "rewards/margins": 5.2170515060424805, "rewards/rejected": -6.487068176269531, "step": 195 }, { "epoch": 0.40658628289900167, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 47.96509016237826, "learning_rate": 8.869665513264129e-07, "logits/chosen": 0.5451265573501587, "logits/rejected": 0.6818545460700989, "logps/accuracies": 0.8125, "logps/chosen": -262.90087890625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -249.34771728515625, "logps/ref_rejected": -265.0631408691406, "logps/rejected": -335.938232421875, "loss": 0.5831, "rewards/accuracies": 0.875, "rewards/chosen": -1.3553152084350586, "rewards/grad_term": 0.012808618135750294, "rewards/margins": 5.732193946838379, "rewards/rejected": -7.087508201599121, "step": 196 }, { "epoch": 0.4086607027097109, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 21.697323150316873, "learning_rate": 8.858131487889273e-07, "logits/chosen": 0.11935015022754669, "logits/rejected": 0.2006658911705017, "logps/accuracies": 0.875, "logps/chosen": -336.0437927246094, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -329.3843078613281, "logps/ref_rejected": -327.40045166015625, "logps/rejected": -413.8304748535156, "loss": 0.6046, "rewards/accuracies": 1.0, "rewards/chosen": -0.6659499406814575, "rewards/grad_term": 0.00346172577701509, "rewards/margins": 7.977048873901367, "rewards/rejected": -8.642998695373535, "step": 197 }, { "epoch": 0.41073512252042005, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 40.358779912668254, "learning_rate": 8.846597462514417e-07, "logits/chosen": 0.23720747232437134, "logits/rejected": 0.2754895091056824, "logps/accuracies": 0.875, "logps/chosen": -296.7972106933594, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -265.8367614746094, "logps/ref_rejected": -281.0025939941406, "logps/rejected": -351.4774475097656, "loss": 0.625, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0960447788238525, "rewards/grad_term": 0.024012045934796333, "rewards/margins": 3.9514381885528564, "rewards/rejected": -7.047482967376709, "step": 198 }, { "epoch": 0.41280954233112926, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 92.31220771810881, "learning_rate": 8.835063437139562e-07, "logits/chosen": 0.4791252911090851, "logits/rejected": 0.5575248599052429, "logps/accuracies": 0.8125, "logps/chosen": -268.31036376953125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -276.9249572753906, "logps/ref_rejected": -266.7868957519531, "logps/rejected": -348.34381103515625, "loss": 0.5736, "rewards/accuracies": 1.0, "rewards/chosen": 0.861458420753479, "rewards/grad_term": 0.004701174795627594, "rewards/margins": 9.0171537399292, "rewards/rejected": -8.155694961547852, "step": 199 }, { "epoch": 0.4148839621418385, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0625, "grad_norm": 45.77620831393305, "learning_rate": 8.823529411764705e-07, "logits/chosen": 0.12551181018352509, "logits/rejected": 0.14135059714317322, "logps/accuracies": 0.9375, "logps/chosen": -315.8223571777344, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -310.73651123046875, "logps/ref_rejected": -292.3979797363281, "logps/rejected": -374.2222900390625, "loss": 0.6131, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5085856318473816, "rewards/grad_term": 0.004745251964777708, "rewards/margins": 7.673846244812012, "rewards/rejected": -8.182432174682617, "step": 200 }, { "epoch": 0.41695838195254764, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 40.635916417828575, "learning_rate": 8.81199538638985e-07, "logits/chosen": 0.06241011992096901, "logits/rejected": 0.11283601820468903, "logps/accuracies": 0.875, "logps/chosen": -293.8592834472656, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -296.7002258300781, "logps/ref_rejected": -303.21331787109375, "logps/rejected": -367.14044189453125, "loss": 0.5767, "rewards/accuracies": 0.9375, "rewards/chosen": 0.28409475088119507, "rewards/grad_term": 0.00995566789060831, "rewards/margins": 6.67680549621582, "rewards/rejected": -6.392710208892822, "step": 201 }, { "epoch": 0.41903280176325686, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.375, "grad_norm": 31.417005116540565, "learning_rate": 8.800461361014993e-07, "logits/chosen": 0.08436602354049683, "logits/rejected": 0.06602154672145844, "logps/accuracies": 0.625, "logps/chosen": -324.9912109375, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -327.7398681640625, "logps/ref_rejected": -292.74981689453125, "logps/rejected": -361.5688171386719, "loss": 0.5621, "rewards/accuracies": 1.0, "rewards/chosen": 0.2748683989048004, "rewards/grad_term": 0.007309483364224434, "rewards/margins": 7.1567702293396, "rewards/rejected": -6.88190221786499, "step": 202 }, { "epoch": 0.421107221573966, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 35.05900906163333, "learning_rate": 8.788927335640138e-07, "logits/chosen": 0.20211604237556458, "logits/rejected": 0.21109752357006073, "logps/accuracies": 0.8125, "logps/chosen": -333.8841552734375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -335.36810302734375, "logps/ref_rejected": -322.5255126953125, "logps/rejected": -367.34234619140625, "loss": 0.5631, "rewards/accuracies": 0.8125, "rewards/chosen": 0.14839425683021545, "rewards/grad_term": 0.019571855664253235, "rewards/margins": 4.630078315734863, "rewards/rejected": -4.481683731079102, "step": 203 }, { "epoch": 0.42318164138467523, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 32.259107415010234, "learning_rate": 8.777393310265282e-07, "logits/chosen": 0.27740195393562317, "logits/rejected": 0.37459149956703186, "logps/accuracies": 0.8125, "logps/chosen": -259.053466796875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -246.73666381835938, "logps/ref_rejected": -271.6988525390625, "logps/rejected": -328.5376281738281, "loss": 0.6065, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2316789627075195, "rewards/grad_term": 0.0200329702347517, "rewards/margins": 4.452197551727295, "rewards/rejected": -5.6838765144348145, "step": 204 }, { "epoch": 0.4252560611953844, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 32.608539229670114, "learning_rate": 8.765859284890427e-07, "logits/chosen": 0.16907714307308197, "logits/rejected": 0.20513200759887695, "logps/accuracies": 0.875, "logps/chosen": -244.8258056640625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -226.30160522460938, "logps/ref_rejected": -258.55389404296875, "logps/rejected": -322.78662109375, "loss": 0.5794, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8524205684661865, "rewards/grad_term": 0.019479090347886086, "rewards/margins": 4.570858001708984, "rewards/rejected": -6.42327880859375, "step": 205 }, { "epoch": 0.4273304810060936, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.125, "grad_norm": 42.93969277035671, "learning_rate": 8.754325259515571e-07, "logits/chosen": 0.19578887522220612, "logits/rejected": 0.24128146469593048, "logps/accuracies": 0.8125, "logps/chosen": -270.4194030761719, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -261.4314270019531, "logps/ref_rejected": -278.7731628417969, "logps/rejected": -349.106201171875, "loss": 0.5817, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8987985849380493, "rewards/grad_term": 0.01616433635354042, "rewards/margins": 6.134509086608887, "rewards/rejected": -7.0333075523376465, "step": 206 }, { "epoch": 0.4294049008168028, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 40.26953156215366, "learning_rate": 8.742791234140715e-07, "logits/chosen": 0.2927509844303131, "logits/rejected": 0.4132809340953827, "logps/accuracies": 0.75, "logps/chosen": -298.0523986816406, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -286.35516357421875, "logps/ref_rejected": -299.4516296386719, "logps/rejected": -371.3787841796875, "loss": 0.5984, "rewards/accuracies": 0.8125, "rewards/chosen": -1.169724702835083, "rewards/grad_term": 0.015548234805464745, "rewards/margins": 6.022989749908447, "rewards/rejected": -7.192714214324951, "step": 207 }, { "epoch": 0.431479320627512, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 44.42958011244542, "learning_rate": 8.731257208765859e-07, "logits/chosen": 0.15052379667758942, "logits/rejected": 0.13466718792915344, "logps/accuracies": 0.75, "logps/chosen": -344.1072692871094, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -348.62744140625, "logps/ref_rejected": -322.577880859375, "logps/rejected": -371.2545471191406, "loss": 0.5599, "rewards/accuracies": 0.9375, "rewards/chosen": 0.45202189683914185, "rewards/grad_term": 0.014142685569822788, "rewards/margins": 5.319693565368652, "rewards/rejected": -4.867671966552734, "step": 208 }, { "epoch": 0.4335537404382212, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.125, "grad_norm": 55.08631188728783, "learning_rate": 8.719723183391004e-07, "logits/chosen": 0.2708834409713745, "logits/rejected": 0.3263266980648041, "logps/accuracies": 0.875, "logps/chosen": -266.4635009765625, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -266.25146484375, "logps/ref_rejected": -271.44537353515625, "logps/rejected": -334.1143798828125, "loss": 0.5873, "rewards/accuracies": 0.9375, "rewards/chosen": -0.021202266216278076, "rewards/grad_term": 0.010078574530780315, "rewards/margins": 6.245699882507324, "rewards/rejected": -6.266901969909668, "step": 209 }, { "epoch": 0.43562816024893036, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0625, "grad_norm": 42.60799619320707, "learning_rate": 8.708189158016147e-07, "logits/chosen": 0.3283449113368988, "logits/rejected": 0.3115319013595581, "logps/accuracies": 0.875, "logps/chosen": -309.02783203125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -306.033935546875, "logps/ref_rejected": -304.4499816894531, "logps/rejected": -374.966064453125, "loss": 0.5595, "rewards/accuracies": 0.875, "rewards/chosen": -0.2993917167186737, "rewards/grad_term": 0.011198869906365871, "rewards/margins": 6.752218246459961, "rewards/rejected": -7.051610469818115, "step": 210 }, { "epoch": 0.4377025800596396, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 74.62504339808369, "learning_rate": 8.696655132641292e-07, "logits/chosen": 0.04295940697193146, "logits/rejected": 0.1364721655845642, "logps/accuracies": 0.75, "logps/chosen": -253.16937255859375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -249.6319580078125, "logps/ref_rejected": -297.3905029296875, "logps/rejected": -369.6263427734375, "loss": 0.5717, "rewards/accuracies": 0.875, "rewards/chosen": -0.3537403643131256, "rewards/grad_term": 0.011611053720116615, "rewards/margins": 6.869847297668457, "rewards/rejected": -7.223587989807129, "step": 211 }, { "epoch": 0.43977699987034874, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.125, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 42.048425807585254, "learning_rate": 8.685121107266435e-07, "logits/chosen": 0.2111242413520813, "logits/rejected": 0.26101890206336975, "logps/accuracies": 0.625, "logps/chosen": -333.13372802734375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -298.3570556640625, "logps/ref_rejected": -285.5286865234375, "logps/rejected": -340.77008056640625, "loss": 0.59, "rewards/accuracies": 0.75, "rewards/chosen": -3.4776673316955566, "rewards/grad_term": 0.029041055589914322, "rewards/margins": 2.046469211578369, "rewards/rejected": -5.524137020111084, "step": 212 }, { "epoch": 0.44185141968105796, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 30.70565260692096, "learning_rate": 8.67358708189158e-07, "logits/chosen": 0.24488027393817902, "logits/rejected": 0.3360682725906372, "logps/accuracies": 0.75, "logps/chosen": -306.7230529785156, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -302.4961242675781, "logps/ref_rejected": -303.74755859375, "logps/rejected": -375.0382080078125, "loss": 0.6463, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4226952791213989, "rewards/grad_term": 0.006055990234017372, "rewards/margins": 6.706371784210205, "rewards/rejected": -7.1290669441223145, "step": 213 }, { "epoch": 0.44392583949176717, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 58.908823582347665, "learning_rate": 8.662053056516724e-07, "logits/chosen": 0.2656205892562866, "logits/rejected": 0.29711484909057617, "logps/accuracies": 0.8125, "logps/chosen": -269.3460693359375, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -255.48500061035156, "logps/ref_rejected": -259.42156982421875, "logps/rejected": -320.4120178222656, "loss": 0.5785, "rewards/accuracies": 0.875, "rewards/chosen": -1.3861079216003418, "rewards/grad_term": 0.017718670889735222, "rewards/margins": 4.71293830871582, "rewards/rejected": -6.099046230316162, "step": 214 }, { "epoch": 0.44600025930247633, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.125, "grad_norm": 56.61591688580251, "learning_rate": 8.650519031141868e-07, "logits/chosen": 0.35960566997528076, "logits/rejected": 0.3392384648323059, "logps/accuracies": 0.8125, "logps/chosen": -299.29644775390625, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -293.4299011230469, "logps/ref_rejected": -299.168701171875, "logps/rejected": -368.6275939941406, "loss": 0.5517, "rewards/accuracies": 0.875, "rewards/chosen": -0.5866526961326599, "rewards/grad_term": 0.011029000394046307, "rewards/margins": 6.359241008758545, "rewards/rejected": -6.94589376449585, "step": 215 }, { "epoch": 0.44807467911318555, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 35.14556178232318, "learning_rate": 8.638985005767012e-07, "logits/chosen": 0.11206863820552826, "logits/rejected": 0.19429105520248413, "logps/accuracies": 0.875, "logps/chosen": -280.0439453125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -273.8160705566406, "logps/ref_rejected": -276.4722595214844, "logps/rejected": -337.29913330078125, "loss": 0.6022, "rewards/accuracies": 1.0, "rewards/chosen": -0.6227847933769226, "rewards/grad_term": 0.008503232151269913, "rewards/margins": 5.459905624389648, "rewards/rejected": -6.082690715789795, "step": 216 }, { "epoch": 0.4501490989238947, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 36.20806400718805, "learning_rate": 8.627450980392156e-07, "logits/chosen": -0.1600431501865387, "logits/rejected": -0.10942815244197845, "logps/accuracies": 0.875, "logps/chosen": -301.800048828125, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -299.7960205078125, "logps/ref_rejected": -275.73529052734375, "logps/rejected": -364.4363098144531, "loss": 0.5609, "rewards/accuracies": 1.0, "rewards/chosen": -0.20040588080883026, "rewards/grad_term": 0.004247845150530338, "rewards/margins": 8.669699668884277, "rewards/rejected": -8.870105743408203, "step": 217 }, { "epoch": 0.4522235187346039, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.125, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 45.446157769285975, "learning_rate": 8.615916955017301e-07, "logits/chosen": 0.24637356400489807, "logits/rejected": 0.284047931432724, "logps/accuracies": 0.625, "logps/chosen": -249.30892944335938, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -239.27142333984375, "logps/ref_rejected": -260.403076171875, "logps/rejected": -308.0331726074219, "loss": 0.6314, "rewards/accuracies": 0.875, "rewards/chosen": -1.0037511587142944, "rewards/grad_term": 0.02335098199546337, "rewards/margins": 3.759258508682251, "rewards/rejected": -4.763010025024414, "step": 218 }, { "epoch": 0.4542979385453131, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 30.72090456802453, "learning_rate": 8.604382929642446e-07, "logits/chosen": 0.30798035860061646, "logits/rejected": 0.3721332848072052, "logps/accuracies": 0.875, "logps/chosen": -270.7388000488281, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -256.83319091796875, "logps/ref_rejected": -253.60997009277344, "logps/rejected": -320.3992004394531, "loss": 0.5356, "rewards/accuracies": 0.75, "rewards/chosen": -1.3905625343322754, "rewards/grad_term": 0.02029426395893097, "rewards/margins": 5.288358688354492, "rewards/rejected": -6.678920745849609, "step": 219 }, { "epoch": 0.4563723583560223, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 51.870714437465466, "learning_rate": 8.592848904267589e-07, "logits/chosen": 0.03684063255786896, "logits/rejected": 0.16017459332942963, "logps/accuracies": 0.75, "logps/chosen": -253.1938934326172, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -256.5488586425781, "logps/ref_rejected": -286.39129638671875, "logps/rejected": -353.81689453125, "loss": 0.5563, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3354969322681427, "rewards/grad_term": 0.007589938119053841, "rewards/margins": 7.078057765960693, "rewards/rejected": -6.742560386657715, "step": 220 }, { "epoch": 0.4584467781667315, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.625, "flips/incorrect->incorrect": 0.125, "grad_norm": 35.42772905894603, "learning_rate": 8.581314878892734e-07, "logits/chosen": 0.2873913049697876, "logits/rejected": 0.26383453607559204, "logps/accuracies": 0.875, "logps/chosen": -360.1090087890625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -353.25018310546875, "logps/ref_rejected": -333.6820068359375, "logps/rejected": -418.5933532714844, "loss": 0.5623, "rewards/accuracies": 1.0, "rewards/chosen": -0.6858816146850586, "rewards/grad_term": 0.0032467113342136145, "rewards/margins": 7.8052544593811035, "rewards/rejected": -8.491135597229004, "step": 221 }, { "epoch": 0.4605211979774407, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 84.34900463866556, "learning_rate": 8.569780853517877e-07, "logits/chosen": 0.1874726116657257, "logits/rejected": 0.2251831591129303, "logps/accuracies": 0.875, "logps/chosen": -264.17974853515625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -255.59912109375, "logps/ref_rejected": -261.2677001953125, "logps/rejected": -335.05792236328125, "loss": 0.6218, "rewards/accuracies": 1.0, "rewards/chosen": -0.858064591884613, "rewards/grad_term": 0.008338917046785355, "rewards/margins": 6.520959377288818, "rewards/rejected": -7.379024028778076, "step": 222 }, { "epoch": 0.4625956177881499, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 27.170194985611396, "learning_rate": 8.558246828143022e-07, "logits/chosen": 0.26863163709640503, "logits/rejected": 0.2670744061470032, "logps/accuracies": 0.875, "logps/chosen": -277.2497863769531, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -275.13916015625, "logps/ref_rejected": -272.11383056640625, "logps/rejected": -328.0150146484375, "loss": 0.5319, "rewards/accuracies": 0.9375, "rewards/chosen": -0.21106398105621338, "rewards/grad_term": 0.012204117141664028, "rewards/margins": 5.379053115844727, "rewards/rejected": -5.590117931365967, "step": 223 }, { "epoch": 0.46467003759885905, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 79.27462179097967, "learning_rate": 8.546712802768166e-07, "logits/chosen": 0.18783439695835114, "logits/rejected": 0.19196242094039917, "logps/accuracies": 0.875, "logps/chosen": -347.10906982421875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -339.8036193847656, "logps/ref_rejected": -325.4422912597656, "logps/rejected": -407.8504333496094, "loss": 0.6027, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7305458784103394, "rewards/grad_term": 0.007087053265422583, "rewards/margins": 7.510266304016113, "rewards/rejected": -8.240811347961426, "step": 224 }, { "epoch": 0.46674445740956827, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.0625, "flips/incorrect->incorrect": 0.25, "grad_norm": 35.29759314617677, "learning_rate": 8.53517877739331e-07, "logits/chosen": -0.19355978071689606, "logits/rejected": -0.016655761748552322, "logps/accuracies": 0.6875, "logps/chosen": -308.5700378417969, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -295.40545654296875, "logps/ref_rejected": -343.7320251464844, "logps/rejected": -400.0100402832031, "loss": 0.5589, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3164560794830322, "rewards/grad_term": 0.02172619104385376, "rewards/margins": 4.311344146728516, "rewards/rejected": -5.627799987792969, "step": 225 }, { "epoch": 0.46881887722027743, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 61.84351353540357, "learning_rate": 8.523644752018454e-07, "logits/chosen": 0.20059531927108765, "logits/rejected": 0.1843167543411255, "logps/accuracies": 0.75, "logps/chosen": -261.94525146484375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -262.80419921875, "logps/ref_rejected": -244.80892944335938, "logps/rejected": -298.7880554199219, "loss": 0.5996, "rewards/accuracies": 0.875, "rewards/chosen": 0.08589661121368408, "rewards/grad_term": 0.018526069819927216, "rewards/margins": 5.4838104248046875, "rewards/rejected": -5.397914409637451, "step": 226 }, { "epoch": 0.47089329703098665, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.0625, "grad_norm": 26.516761004669306, "learning_rate": 8.512110726643598e-07, "logits/chosen": 0.08376497030258179, "logits/rejected": 0.1468810886144638, "logps/accuracies": 0.9375, "logps/chosen": -255.428466796875, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -253.26193237304688, "logps/ref_rejected": -286.236572265625, "logps/rejected": -325.91131591796875, "loss": 0.6258, "rewards/accuracies": 0.8125, "rewards/chosen": -0.21665439009666443, "rewards/grad_term": 0.02595067396759987, "rewards/margins": 3.75081729888916, "rewards/rejected": -3.9674713611602783, "step": 227 }, { "epoch": 0.47296771684169586, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 33.01960140111705, "learning_rate": 8.500576701268742e-07, "logits/chosen": 0.20910394191741943, "logits/rejected": 0.21387630701065063, "logps/accuracies": 0.8125, "logps/chosen": -304.03131103515625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -293.5747985839844, "logps/ref_rejected": -301.56024169921875, "logps/rejected": -366.596435546875, "loss": 0.5344, "rewards/accuracies": 0.875, "rewards/chosen": -1.0456496477127075, "rewards/grad_term": 0.01662050373852253, "rewards/margins": 5.4579668045043945, "rewards/rejected": -6.5036163330078125, "step": 228 }, { "epoch": 0.475042136652405, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 29.434714310320842, "learning_rate": 8.489042675893887e-07, "logits/chosen": 0.15450771152973175, "logits/rejected": 0.19930016994476318, "logps/accuracies": 0.8125, "logps/chosen": -294.2052917480469, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -294.3157653808594, "logps/ref_rejected": -289.5907897949219, "logps/rejected": -365.57635498046875, "loss": 0.5452, "rewards/accuracies": 1.0, "rewards/chosen": 0.011044904589653015, "rewards/grad_term": 0.0027202588971704245, "rewards/margins": 7.609601974487305, "rewards/rejected": -7.598557949066162, "step": 229 }, { "epoch": 0.47711655646311424, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0, "grad_norm": 38.213053143905036, "learning_rate": 8.477508650519031e-07, "logits/chosen": 0.2588901221752167, "logits/rejected": 0.44516509771347046, "logps/accuracies": 1.0, "logps/chosen": -291.5958251953125, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -285.3277282714844, "logps/ref_rejected": -350.38787841796875, "logps/rejected": -426.9537353515625, "loss": 0.5646, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6268075704574585, "rewards/grad_term": 0.007204078137874603, "rewards/margins": 7.029778480529785, "rewards/rejected": -7.656586170196533, "step": 230 }, { "epoch": 0.4791909762738234, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 25.865551411840645, "learning_rate": 8.465974625144176e-07, "logits/chosen": 0.2644757926464081, "logits/rejected": 0.29192623496055603, "logps/accuracies": 0.875, "logps/chosen": -313.8361511230469, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -314.16973876953125, "logps/ref_rejected": -297.2124938964844, "logps/rejected": -363.6337890625, "loss": 0.5635, "rewards/accuracies": 0.875, "rewards/chosen": 0.03335915505886078, "rewards/grad_term": 0.01406506821513176, "rewards/margins": 6.675488471984863, "rewards/rejected": -6.642129421234131, "step": 231 }, { "epoch": 0.4812653960845326, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.125, "grad_norm": 20.4318067473308, "learning_rate": 8.454440599769319e-07, "logits/chosen": 0.18658952414989471, "logits/rejected": 0.25278547406196594, "logps/accuracies": 0.875, "logps/chosen": -266.6859436035156, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -272.0683288574219, "logps/ref_rejected": -291.2773132324219, "logps/rejected": -373.94342041015625, "loss": 0.5679, "rewards/accuracies": 1.0, "rewards/chosen": 0.5382405519485474, "rewards/grad_term": 0.002569821197539568, "rewards/margins": 8.804851531982422, "rewards/rejected": -8.266611099243164, "step": 232 }, { "epoch": 0.4833398158952418, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.125, "grad_norm": 26.814576411742852, "learning_rate": 8.442906574394463e-07, "logits/chosen": 0.22701847553253174, "logits/rejected": 0.5475070476531982, "logps/accuracies": 0.875, "logps/chosen": -319.60491943359375, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -312.2276611328125, "logps/ref_rejected": -340.19671630859375, "logps/rejected": -403.5230712890625, "loss": 0.5561, "rewards/accuracies": 1.0, "rewards/chosen": -0.7377276420593262, "rewards/grad_term": 0.011820271611213684, "rewards/margins": 5.594909191131592, "rewards/rejected": -6.33263635635376, "step": 233 }, { "epoch": 0.485414235705951, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 42.80927565779465, "learning_rate": 8.431372549019608e-07, "logits/chosen": 0.03597773611545563, "logits/rejected": 0.07471846044063568, "logps/accuracies": 0.9375, "logps/chosen": -313.0096435546875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -303.0143737792969, "logps/ref_rejected": -296.0675354003906, "logps/rejected": -380.6692199707031, "loss": 0.5752, "rewards/accuracies": 1.0, "rewards/chosen": -0.9995289444923401, "rewards/grad_term": 0.003610477549955249, "rewards/margins": 7.460636615753174, "rewards/rejected": -8.460165977478027, "step": 234 }, { "epoch": 0.4874886555166602, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.25, "grad_norm": 36.409560354500066, "learning_rate": 8.419838523644751e-07, "logits/chosen": 0.33159953355789185, "logits/rejected": 0.3636232912540436, "logps/accuracies": 0.75, "logps/chosen": -399.6930847167969, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -377.0238037109375, "logps/ref_rejected": -366.9959411621094, "logps/rejected": -440.5434875488281, "loss": 0.6419, "rewards/accuracies": 0.875, "rewards/chosen": -2.2669270038604736, "rewards/grad_term": 0.01446828804910183, "rewards/margins": 5.087828159332275, "rewards/rejected": -7.354754447937012, "step": 235 }, { "epoch": 0.48956307532736937, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 33.58443547525811, "learning_rate": 8.408304498269896e-07, "logits/chosen": 0.41071584820747375, "logits/rejected": 0.46553879976272583, "logps/accuracies": 0.875, "logps/chosen": -272.25848388671875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -277.6263427734375, "logps/ref_rejected": -272.1466369628906, "logps/rejected": -331.35101318359375, "loss": 0.5812, "rewards/accuracies": 1.0, "rewards/chosen": 0.5367855429649353, "rewards/grad_term": 0.007947854697704315, "rewards/margins": 6.457221031188965, "rewards/rejected": -5.920435905456543, "step": 236 }, { "epoch": 0.4916374951380786, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 32.073065465479466, "learning_rate": 8.396770472895039e-07, "logits/chosen": 0.35054367780685425, "logits/rejected": 0.38058048486709595, "logps/accuracies": 0.875, "logps/chosen": -266.0827331542969, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -259.3359375, "logps/ref_rejected": -264.76947021484375, "logps/rejected": -329.091552734375, "loss": 0.5934, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6746789216995239, "rewards/grad_term": 0.01457288395613432, "rewards/margins": 5.75752592086792, "rewards/rejected": -6.432204246520996, "step": 237 }, { "epoch": 0.49371191494878774, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 45.30830579519183, "learning_rate": 8.385236447520184e-07, "logits/chosen": 0.4467751979827881, "logits/rejected": 0.4529315233230591, "logps/accuracies": 0.8125, "logps/chosen": -294.876953125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -297.973876953125, "logps/ref_rejected": -294.9307861328125, "logps/rejected": -360.9711608886719, "loss": 0.559, "rewards/accuracies": 0.875, "rewards/chosen": 0.30968841910362244, "rewards/grad_term": 0.012534530833363533, "rewards/margins": 6.913724899291992, "rewards/rejected": -6.604036808013916, "step": 238 }, { "epoch": 0.49578633475949696, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 42.12849078368493, "learning_rate": 8.373702422145328e-07, "logits/chosen": 0.23043927550315857, "logits/rejected": 0.4118332862854004, "logps/accuracies": 0.75, "logps/chosen": -323.7980041503906, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -326.3075866699219, "logps/ref_rejected": -390.9226379394531, "logps/rejected": -446.6671142578125, "loss": 0.5826, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2509579062461853, "rewards/grad_term": 0.013378635980188847, "rewards/margins": 5.825405597686768, "rewards/rejected": -5.5744476318359375, "step": 239 }, { "epoch": 0.4978607545702061, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 31.909536036533762, "learning_rate": 8.362168396770472e-07, "logits/chosen": 0.16586509346961975, "logits/rejected": 0.25248032808303833, "logps/accuracies": 0.875, "logps/chosen": -282.19866943359375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -285.1270751953125, "logps/ref_rejected": -276.7912902832031, "logps/rejected": -345.7266845703125, "loss": 0.5355, "rewards/accuracies": 1.0, "rewards/chosen": 0.29284125566482544, "rewards/grad_term": 0.007976886816322803, "rewards/margins": 7.186383247375488, "rewards/rejected": -6.89354133605957, "step": 240 }, { "epoch": 0.49993517438091534, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 30.525809924473112, "learning_rate": 8.350634371395616e-07, "logits/chosen": 0.3237009048461914, "logits/rejected": 0.40990960597991943, "logps/accuracies": 0.875, "logps/chosen": -287.2841796875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -285.5251159667969, "logps/ref_rejected": -302.3129577636719, "logps/rejected": -370.07891845703125, "loss": 0.5961, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1759052574634552, "rewards/grad_term": 0.010726590640842915, "rewards/margins": 6.6006951332092285, "rewards/rejected": -6.776600360870361, "step": 241 }, { "epoch": 0.5020095941916245, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 31.349241798619, "learning_rate": 8.33910034602076e-07, "logits/chosen": 0.05717964842915535, "logits/rejected": 0.0796816349029541, "logps/accuracies": 0.8125, "logps/chosen": -295.54229736328125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -297.224365234375, "logps/ref_rejected": -284.146728515625, "logps/rejected": -344.3078918457031, "loss": 0.578, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16821032762527466, "rewards/grad_term": 0.014522448182106018, "rewards/margins": 6.184324264526367, "rewards/rejected": -6.016113758087158, "step": 242 }, { "epoch": 0.5040840140023337, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 33.030305222542474, "learning_rate": 8.327566320645905e-07, "logits/chosen": 0.06713651120662689, "logits/rejected": 0.08016189187765121, "logps/accuracies": 0.9375, "logps/chosen": -268.2383117675781, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -267.01324462890625, "logps/ref_rejected": -280.7307434082031, "logps/rejected": -350.9720458984375, "loss": 0.557, "rewards/accuracies": 1.0, "rewards/chosen": -0.12250781059265137, "rewards/grad_term": 0.006276478059589863, "rewards/margins": 6.901622772216797, "rewards/rejected": -7.024130344390869, "step": 243 }, { "epoch": 0.5061584338130429, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 27.004780238727747, "learning_rate": 8.31603229527105e-07, "logits/chosen": 0.04987862706184387, "logits/rejected": 0.018930042162537575, "logps/accuracies": 0.75, "logps/chosen": -304.90655517578125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -304.6758728027344, "logps/ref_rejected": -300.8800048828125, "logps/rejected": -368.5686950683594, "loss": 0.5892, "rewards/accuracies": 0.875, "rewards/chosen": -0.023071274161338806, "rewards/grad_term": 0.01271775085479021, "rewards/margins": 6.745797157287598, "rewards/rejected": -6.768868923187256, "step": 244 }, { "epoch": 0.5082328536237521, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 39.398222375890796, "learning_rate": 8.304498269896193e-07, "logits/chosen": 0.15408623218536377, "logits/rejected": 0.16406217217445374, "logps/accuracies": 0.8125, "logps/chosen": -318.6749572753906, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -316.9960632324219, "logps/ref_rejected": -343.3831787109375, "logps/rejected": -391.7142333984375, "loss": 0.5758, "rewards/accuracies": 0.875, "rewards/chosen": -0.16788998246192932, "rewards/grad_term": 0.01882031187415123, "rewards/margins": 4.6652140617370605, "rewards/rejected": -4.833104133605957, "step": 245 }, { "epoch": 0.5103072734344613, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.25, "grad_norm": 57.065723685654525, "learning_rate": 8.292964244521338e-07, "logits/chosen": 0.21645879745483398, "logits/rejected": 0.2783927619457245, "logps/accuracies": 0.75, "logps/chosen": -304.6559143066406, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -307.0755615234375, "logps/ref_rejected": -296.75225830078125, "logps/rejected": -353.28765869140625, "loss": 0.5619, "rewards/accuracies": 0.875, "rewards/chosen": 0.241965651512146, "rewards/grad_term": 0.016664672642946243, "rewards/margins": 5.895508289337158, "rewards/rejected": -5.6535420417785645, "step": 246 }, { "epoch": 0.5123816932451705, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.3125, "grad_norm": 32.148990652138416, "learning_rate": 8.281430219146481e-07, "logits/chosen": 0.19208469986915588, "logits/rejected": 0.09641852974891663, "logps/accuracies": 0.6875, "logps/chosen": -406.9360046386719, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -399.0845947265625, "logps/ref_rejected": -377.1170349121094, "logps/rejected": -460.0271911621094, "loss": 0.5832, "rewards/accuracies": 1.0, "rewards/chosen": -0.7851426005363464, "rewards/grad_term": 0.0035166891757398844, "rewards/margins": 7.505876064300537, "rewards/rejected": -8.29101848602295, "step": 247 }, { "epoch": 0.5144561130558797, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 44.560082669866084, "learning_rate": 8.269896193771626e-07, "logits/chosen": 0.4188195765018463, "logits/rejected": 0.4568687975406647, "logps/accuracies": 0.8125, "logps/chosen": -319.69805908203125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -304.82037353515625, "logps/ref_rejected": -328.6085205078125, "logps/rejected": -397.77398681640625, "loss": 0.5775, "rewards/accuracies": 1.0, "rewards/chosen": -1.4877678155899048, "rewards/grad_term": 0.007945088669657707, "rewards/margins": 5.428779602050781, "rewards/rejected": -6.916546821594238, "step": 248 }, { "epoch": 0.5165305328665889, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 49.37389001042154, "learning_rate": 8.25836216839677e-07, "logits/chosen": 0.0985247939825058, "logits/rejected": 0.13652461767196655, "logps/accuracies": 0.875, "logps/chosen": -316.8545837402344, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -325.44244384765625, "logps/ref_rejected": -335.2078857421875, "logps/rejected": -413.33251953125, "loss": 0.5037, "rewards/accuracies": 1.0, "rewards/chosen": 0.8587861657142639, "rewards/grad_term": 0.004567756317555904, "rewards/margins": 8.671252250671387, "rewards/rejected": -7.812466144561768, "step": 249 }, { "epoch": 0.5186049526772981, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0, "grad_norm": 55.51867340899016, "learning_rate": 8.246828143021914e-07, "logits/chosen": 0.28938692808151245, "logits/rejected": 0.27797943353652954, "logps/accuracies": 1.0, "logps/chosen": -322.72845458984375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -325.44219970703125, "logps/ref_rejected": -326.6589660644531, "logps/rejected": -402.5311279296875, "loss": 0.5775, "rewards/accuracies": 0.9375, "rewards/chosen": 0.27137255668640137, "rewards/grad_term": 0.01095657143741846, "rewards/margins": 7.858592510223389, "rewards/rejected": -7.587219715118408, "step": 250 }, { "epoch": 0.5206793724880072, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 56.0035451408766, "learning_rate": 8.235294117647058e-07, "logits/chosen": 0.2637563943862915, "logits/rejected": 0.33145201206207275, "logps/accuracies": 0.8125, "logps/chosen": -395.2080078125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -379.0206604003906, "logps/ref_rejected": -411.3179931640625, "logps/rejected": -493.62921142578125, "loss": 0.5508, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6187384128570557, "rewards/grad_term": 0.009982087649405003, "rewards/margins": 6.612382411956787, "rewards/rejected": -8.231120109558105, "step": 251 }, { "epoch": 0.5227537922987164, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 49.66999137567622, "learning_rate": 8.223760092272203e-07, "logits/chosen": 0.15164095163345337, "logits/rejected": 0.20300878584384918, "logps/accuracies": 0.75, "logps/chosen": -314.79412841796875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -317.7497253417969, "logps/ref_rejected": -307.1460876464844, "logps/rejected": -389.70745849609375, "loss": 0.5436, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2955569326877594, "rewards/grad_term": 0.00618112925440073, "rewards/margins": 8.551695823669434, "rewards/rejected": -8.256139755249023, "step": 252 }, { "epoch": 0.5248282121094257, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.125, "grad_norm": 39.0728876643306, "learning_rate": 8.212226066897346e-07, "logits/chosen": 0.4079715609550476, "logits/rejected": 0.5913187861442566, "logps/accuracies": 0.875, "logps/chosen": -299.94525146484375, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -300.0797119140625, "logps/ref_rejected": -358.2881164550781, "logps/rejected": -424.64324951171875, "loss": 0.494, "rewards/accuracies": 0.875, "rewards/chosen": 0.01344829797744751, "rewards/grad_term": 0.008727732114493847, "rewards/margins": 6.648958683013916, "rewards/rejected": -6.635509967803955, "step": 253 }, { "epoch": 0.5269026319201349, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 31.821418285963002, "learning_rate": 8.200692041522491e-07, "logits/chosen": 0.4014374613761902, "logits/rejected": 0.4343331456184387, "logps/accuracies": 0.9375, "logps/chosen": -231.1729278564453, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -230.2028045654297, "logps/ref_rejected": -237.1446075439453, "logps/rejected": -303.7888488769531, "loss": 0.5901, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09701316803693771, "rewards/grad_term": 0.006798036862164736, "rewards/margins": 6.56741189956665, "rewards/rejected": -6.664424896240234, "step": 254 }, { "epoch": 0.5289770517308441, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 49.53309810118567, "learning_rate": 8.189158016147634e-07, "logits/chosen": 0.10699253529310226, "logits/rejected": 0.11342119425535202, "logps/accuracies": 0.75, "logps/chosen": -301.25408935546875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -310.416748046875, "logps/ref_rejected": -280.7598876953125, "logps/rejected": -345.6063537597656, "loss": 0.6471, "rewards/accuracies": 0.875, "rewards/chosen": 0.9162629842758179, "rewards/grad_term": 0.009691519662737846, "rewards/margins": 7.400913715362549, "rewards/rejected": -6.484650611877441, "step": 255 }, { "epoch": 0.5310514715415532, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 33.11473943842344, "learning_rate": 8.17762399077278e-07, "logits/chosen": 0.09529374539852142, "logits/rejected": 0.2966251075267792, "logps/accuracies": 0.6875, "logps/chosen": -278.4207763671875, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -268.88885498046875, "logps/ref_rejected": -287.082275390625, "logps/rejected": -344.5219421386719, "loss": 0.5951, "rewards/accuracies": 0.75, "rewards/chosen": -0.9531930088996887, "rewards/grad_term": 0.021204093471169472, "rewards/margins": 4.790775775909424, "rewards/rejected": -5.743968963623047, "step": 256 }, { "epoch": 0.5310514715415532, "eval_flips/correct->correct": 0.4334975481033325, "eval_flips/correct->incorrect": 0.009852216579020023, "eval_flips/incorrect->correct": 0.3300492465496063, "eval_flips/incorrect->incorrect": 0.2266009896993637, "eval_logits/chosen": 0.20908966660499573, "eval_logits/rejected": 0.25232627987861633, "eval_logps/accuracies": 0.7635468244552612, "eval_logps/chosen": -291.91790771484375, "eval_logps/ref_accuracies": 0.4433497488498688, "eval_logps/ref_chosen": -287.3511047363281, "eval_logps/ref_rejected": -289.0460205078125, "eval_logps/rejected": -345.9736328125, "eval_loss": 0.6100751161575317, "eval_rewards/accuracies": 0.8768472671508789, "eval_rewards/chosen": -0.4566830098628998, "eval_rewards/grad_term": 0.015678314492106438, "eval_rewards/margins": 5.236079216003418, "eval_rewards/rejected": -5.69276237487793, "eval_runtime": 803.7781, "eval_samples_per_second": 2.013, "eval_steps_per_second": 0.253, "step": 256 }, { "epoch": 0.5331258913522624, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 37.54918040748534, "learning_rate": 8.166089965397924e-07, "logits/chosen": 0.14258748292922974, "logits/rejected": 0.1967656910419464, "logps/accuracies": 0.875, "logps/chosen": -307.8832702636719, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -313.5946350097656, "logps/ref_rejected": -304.44818115234375, "logps/rejected": -381.94158935546875, "loss": 0.5998, "rewards/accuracies": 1.0, "rewards/chosen": 0.5711380243301392, "rewards/grad_term": 0.004750548396259546, "rewards/margins": 8.320480346679688, "rewards/rejected": -7.749342918395996, "step": 257 }, { "epoch": 0.5352003111629716, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 66.7330393421814, "learning_rate": 8.154555940023068e-07, "logits/chosen": 0.3797518014907837, "logits/rejected": 0.36165231466293335, "logps/accuracies": 0.8125, "logps/chosen": -305.75604248046875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -297.5378112792969, "logps/ref_rejected": -278.42840576171875, "logps/rejected": -340.3268127441406, "loss": 0.5622, "rewards/accuracies": 0.875, "rewards/chosen": -0.8218250274658203, "rewards/grad_term": 0.01640998013317585, "rewards/margins": 5.368016242980957, "rewards/rejected": -6.189841270446777, "step": 258 }, { "epoch": 0.5372747309736808, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.25, "grad_norm": 38.0243266757412, "learning_rate": 8.143021914648212e-07, "logits/chosen": 0.31451526284217834, "logits/rejected": 0.3426423668861389, "logps/accuracies": 0.75, "logps/chosen": -214.24871826171875, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -212.14849853515625, "logps/ref_rejected": -200.25625610351562, "logps/rejected": -251.4813232421875, "loss": 0.5821, "rewards/accuracies": 0.875, "rewards/chosen": -0.210022434592247, "rewards/grad_term": 0.01893402822315693, "rewards/margins": 4.912485122680664, "rewards/rejected": -5.1225080490112305, "step": 259 }, { "epoch": 0.5393491507843899, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.3125, "grad_norm": 27.658939795121892, "learning_rate": 8.131487889273356e-07, "logits/chosen": 0.1209304928779602, "logits/rejected": 0.1580743044614792, "logps/accuracies": 0.6875, "logps/chosen": -291.9943542480469, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -296.20867919921875, "logps/ref_rejected": -299.74664306640625, "logps/rejected": -365.398193359375, "loss": 0.5422, "rewards/accuracies": 1.0, "rewards/chosen": 0.4214297831058502, "rewards/grad_term": 0.009202235378324986, "rewards/margins": 6.986582279205322, "rewards/rejected": -6.565152645111084, "step": 260 }, { "epoch": 0.5414235705950992, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 75.72204106158672, "learning_rate": 8.1199538638985e-07, "logits/chosen": 0.16947351396083832, "logits/rejected": 0.18022188544273376, "logps/accuracies": 0.75, "logps/chosen": -281.2515869140625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -267.09417724609375, "logps/ref_rejected": -272.4349060058594, "logps/rejected": -333.783203125, "loss": 0.606, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4157400131225586, "rewards/grad_term": 0.01815151423215866, "rewards/margins": 4.719089984893799, "rewards/rejected": -6.134829998016357, "step": 261 }, { "epoch": 0.5434979904058084, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 58.5042523870511, "learning_rate": 8.108419838523645e-07, "logits/chosen": 0.18119366466999054, "logits/rejected": 0.3171493113040924, "logps/accuracies": 0.875, "logps/chosen": -298.76116943359375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -292.3468017578125, "logps/ref_rejected": -334.3692321777344, "logps/rejected": -419.0423278808594, "loss": 0.5495, "rewards/accuracies": 1.0, "rewards/chosen": -0.6414406299591064, "rewards/grad_term": 0.005858146119862795, "rewards/margins": 7.825870990753174, "rewards/rejected": -8.467310905456543, "step": 262 }, { "epoch": 0.5455724102165176, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 16.445083432164537, "learning_rate": 8.096885813148788e-07, "logits/chosen": 0.37694644927978516, "logits/rejected": 0.43579670786857605, "logps/accuracies": 0.875, "logps/chosen": -358.7106018066406, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -354.29962158203125, "logps/ref_rejected": -385.1765441894531, "logps/rejected": -465.8778381347656, "loss": 0.5401, "rewards/accuracies": 0.9375, "rewards/chosen": -0.44109994173049927, "rewards/grad_term": 0.005547558423131704, "rewards/margins": 7.6290283203125, "rewards/rejected": -8.070128440856934, "step": 263 }, { "epoch": 0.5476468300272268, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 43.5572613048707, "learning_rate": 8.085351787773933e-07, "logits/chosen": 0.3072161078453064, "logits/rejected": 0.2626444697380066, "logps/accuracies": 0.8125, "logps/chosen": -259.9176330566406, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -245.10406494140625, "logps/ref_rejected": -250.30921936035156, "logps/rejected": -325.20831298828125, "loss": 0.5928, "rewards/accuracies": 1.0, "rewards/chosen": -1.4813560247421265, "rewards/grad_term": 0.01034230925142765, "rewards/margins": 6.008551597595215, "rewards/rejected": -7.489907264709473, "step": 264 }, { "epoch": 0.5497212498379359, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 42.82203006051057, "learning_rate": 8.073817762399076e-07, "logits/chosen": 0.13615286350250244, "logits/rejected": 0.20001475512981415, "logps/accuracies": 0.75, "logps/chosen": -338.6938171386719, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -319.03753662109375, "logps/ref_rejected": -324.310791015625, "logps/rejected": -421.2921447753906, "loss": 0.5756, "rewards/accuracies": 1.0, "rewards/chosen": -1.9656240940093994, "rewards/grad_term": 0.006605319678783417, "rewards/margins": 7.732507705688477, "rewards/rejected": -9.698131561279297, "step": 265 }, { "epoch": 0.5517956696486451, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 52.44815820061085, "learning_rate": 8.062283737024221e-07, "logits/chosen": 0.21208931505680084, "logits/rejected": 0.25778982043266296, "logps/accuracies": 0.875, "logps/chosen": -344.6730651855469, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -331.10345458984375, "logps/ref_rejected": -351.76409912109375, "logps/rejected": -448.85931396484375, "loss": 0.613, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3569657802581787, "rewards/grad_term": 0.00555332051590085, "rewards/margins": 8.352553367614746, "rewards/rejected": -9.70952033996582, "step": 266 }, { "epoch": 0.5538700894593543, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.125, "grad_norm": 41.602018473752594, "learning_rate": 8.050749711649365e-07, "logits/chosen": 0.028799353167414665, "logits/rejected": 0.013060306198894978, "logps/accuracies": 0.875, "logps/chosen": -331.1357727050781, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -303.6278076171875, "logps/ref_rejected": -292.3013610839844, "logps/rejected": -394.4818115234375, "loss": 0.6173, "rewards/accuracies": 1.0, "rewards/chosen": -2.750793933868408, "rewards/grad_term": 0.004837782587856054, "rewards/margins": 7.467255115509033, "rewards/rejected": -10.218048095703125, "step": 267 }, { "epoch": 0.5559445092700636, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 113.12904360596185, "learning_rate": 8.03921568627451e-07, "logits/chosen": 0.02135728858411312, "logits/rejected": 0.08749254792928696, "logps/accuracies": 0.8125, "logps/chosen": -336.35760498046875, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -308.16162109375, "logps/ref_rejected": -309.57598876953125, "logps/rejected": -400.8465576171875, "loss": 0.6239, "rewards/accuracies": 0.75, "rewards/chosen": -2.8195974826812744, "rewards/grad_term": 0.018539071083068848, "rewards/margins": 6.307459831237793, "rewards/rejected": -9.127056121826172, "step": 268 }, { "epoch": 0.5580189290807728, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 33.96227826132554, "learning_rate": 8.027681660899654e-07, "logits/chosen": 0.5619252324104309, "logits/rejected": 0.5743327736854553, "logps/accuracies": 0.9375, "logps/chosen": -255.11105346679688, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -252.5300750732422, "logps/ref_rejected": -270.99432373046875, "logps/rejected": -342.1889953613281, "loss": 0.5774, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2580994963645935, "rewards/grad_term": 0.01020655408501625, "rewards/margins": 6.861366271972656, "rewards/rejected": -7.1194658279418945, "step": 269 }, { "epoch": 0.5600933488914819, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.1875, "grad_norm": 24.02468529781505, "learning_rate": 8.016147635524798e-07, "logits/chosen": 0.27724260091781616, "logits/rejected": 0.2910709083080292, "logps/accuracies": 0.8125, "logps/chosen": -300.6127014160156, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -277.4232482910156, "logps/ref_rejected": -294.59234619140625, "logps/rejected": -373.547607421875, "loss": 0.5739, "rewards/accuracies": 0.875, "rewards/chosen": -2.3189470767974854, "rewards/grad_term": 0.014720053412020206, "rewards/margins": 5.57658052444458, "rewards/rejected": -7.895526885986328, "step": 270 }, { "epoch": 0.5621677687021911, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 28.444387072372734, "learning_rate": 8.004613610149942e-07, "logits/chosen": 0.05013295263051987, "logits/rejected": 0.06607392430305481, "logps/accuracies": 0.9375, "logps/chosen": -278.4317626953125, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -279.4646301269531, "logps/ref_rejected": -289.01959228515625, "logps/rejected": -344.6971130371094, "loss": 0.5915, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10328565537929535, "rewards/grad_term": 0.00966467522084713, "rewards/margins": 5.671037673950195, "rewards/rejected": -5.567751884460449, "step": 271 }, { "epoch": 0.5642421885129003, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 51.22016296104716, "learning_rate": 7.993079584775087e-07, "logits/chosen": 0.37125492095947266, "logits/rejected": 0.3817085325717926, "logps/accuracies": 0.75, "logps/chosen": -308.3736572265625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -300.4655456542969, "logps/ref_rejected": -302.07696533203125, "logps/rejected": -366.42919921875, "loss": 0.5656, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7908135652542114, "rewards/grad_term": 0.007430646568536758, "rewards/margins": 5.644411563873291, "rewards/rejected": -6.435225009918213, "step": 272 }, { "epoch": 0.5663166083236095, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 32.46457915793264, "learning_rate": 7.98154555940023e-07, "logits/chosen": 0.17831876873970032, "logits/rejected": 0.1457141786813736, "logps/accuracies": 0.75, "logps/chosen": -282.64410400390625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -274.8734130859375, "logps/ref_rejected": -274.3963317871094, "logps/rejected": -331.91021728515625, "loss": 0.5866, "rewards/accuracies": 0.875, "rewards/chosen": -0.7770657539367676, "rewards/grad_term": 0.016797857359051704, "rewards/margins": 4.974320411682129, "rewards/rejected": -5.751385688781738, "step": 273 }, { "epoch": 0.5683910281343186, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 32.9471484951805, "learning_rate": 7.970011534025375e-07, "logits/chosen": 0.5048956871032715, "logits/rejected": 0.496670126914978, "logps/accuracies": 0.8125, "logps/chosen": -301.3432922363281, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -304.7401123046875, "logps/ref_rejected": -310.8699645996094, "logps/rejected": -359.89007568359375, "loss": 0.617, "rewards/accuracies": 0.875, "rewards/chosen": 0.33968228101730347, "rewards/grad_term": 0.01532800029963255, "rewards/margins": 5.24169397354126, "rewards/rejected": -4.902011871337891, "step": 274 }, { "epoch": 0.5704654479450278, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 26.58072307751858, "learning_rate": 7.958477508650518e-07, "logits/chosen": 0.1677144318819046, "logits/rejected": 0.2207137495279312, "logps/accuracies": 0.75, "logps/chosen": -240.13754272460938, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -248.08505249023438, "logps/ref_rejected": -233.90797424316406, "logps/rejected": -271.0197448730469, "loss": 0.6325, "rewards/accuracies": 0.875, "rewards/chosen": 0.7947514653205872, "rewards/grad_term": 0.017878375947475433, "rewards/margins": 4.505929946899414, "rewards/rejected": -3.711178779602051, "step": 275 }, { "epoch": 0.5725398677557371, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 41.27546406513188, "learning_rate": 7.946943483275663e-07, "logits/chosen": 0.36588138341903687, "logits/rejected": 0.4112645983695984, "logps/accuracies": 0.75, "logps/chosen": -253.30035400390625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -247.00111389160156, "logps/ref_rejected": -257.2711486816406, "logps/rejected": -302.595703125, "loss": 0.6566, "rewards/accuracies": 1.0, "rewards/chosen": -0.6299245357513428, "rewards/grad_term": 0.020320266485214233, "rewards/margins": 3.9025347232818604, "rewards/rejected": -4.532459259033203, "step": 276 }, { "epoch": 0.5746142875664463, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.375, "grad_norm": 65.27110995983777, "learning_rate": 7.935409457900807e-07, "logits/chosen": -0.01693597435951233, "logits/rejected": 0.046872012317180634, "logps/accuracies": 0.625, "logps/chosen": -275.9945373535156, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -282.8183898925781, "logps/ref_rejected": -310.315673828125, "logps/rejected": -344.1127014160156, "loss": 0.6913, "rewards/accuracies": 0.9375, "rewards/chosen": 0.682384192943573, "rewards/grad_term": 0.023220881819725037, "rewards/margins": 4.062088489532471, "rewards/rejected": -3.379704475402832, "step": 277 }, { "epoch": 0.5766887073771555, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 51.19472036277012, "learning_rate": 7.923875432525951e-07, "logits/chosen": 0.19429253041744232, "logits/rejected": 0.19109566509723663, "logps/accuracies": 0.6875, "logps/chosen": -299.841796875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -306.51116943359375, "logps/ref_rejected": -324.4086608886719, "logps/rejected": -347.66070556640625, "loss": 0.7188, "rewards/accuracies": 0.9375, "rewards/chosen": 0.666935920715332, "rewards/grad_term": 0.025834256783127785, "rewards/margins": 2.9921374320983887, "rewards/rejected": -2.3252012729644775, "step": 278 }, { "epoch": 0.5787631271878646, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.4375, "grad_norm": 111.63212030223526, "learning_rate": 7.912341407151095e-07, "logits/chosen": 0.026315703988075256, "logits/rejected": 0.05185367166996002, "logps/accuracies": 0.5625, "logps/chosen": -312.5485534667969, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -322.3186340332031, "logps/ref_rejected": -310.2381286621094, "logps/rejected": -346.5958251953125, "loss": 0.6961, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9770085215568542, "rewards/grad_term": 0.019868649542331696, "rewards/margins": 4.612778663635254, "rewards/rejected": -3.635770082473755, "step": 279 }, { "epoch": 0.5808375469985738, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.25, "grad_norm": 40.5130776185035, "learning_rate": 7.90080738177624e-07, "logits/chosen": 0.32091373205184937, "logits/rejected": 0.4111550450325012, "logps/accuracies": 0.75, "logps/chosen": -203.61636352539062, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -205.00054931640625, "logps/ref_rejected": -283.1324462890625, "logps/rejected": -327.993896484375, "loss": 0.6669, "rewards/accuracies": 0.875, "rewards/chosen": 0.1384190022945404, "rewards/grad_term": 0.016185998916625977, "rewards/margins": 4.624567985534668, "rewards/rejected": -4.486148357391357, "step": 280 }, { "epoch": 0.582911966809283, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 39.877534822937, "learning_rate": 7.889273356401384e-07, "logits/chosen": 0.3415575325489044, "logits/rejected": 0.360428124666214, "logps/accuracies": 0.75, "logps/chosen": -321.03564453125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -328.7980651855469, "logps/ref_rejected": -312.2154541015625, "logps/rejected": -357.23773193359375, "loss": 0.6027, "rewards/accuracies": 0.875, "rewards/chosen": 0.7762415409088135, "rewards/grad_term": 0.020651506260037422, "rewards/margins": 5.278467655181885, "rewards/rejected": -4.50222635269165, "step": 281 }, { "epoch": 0.5849863866199922, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.375, "grad_norm": 20.370696721525093, "learning_rate": 7.877739331026529e-07, "logits/chosen": -0.10976716130971909, "logits/rejected": 0.04267115890979767, "logps/accuracies": 0.5625, "logps/chosen": -316.794189453125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -321.988525390625, "logps/ref_rejected": -346.69873046875, "logps/rejected": -393.5243225097656, "loss": 0.5422, "rewards/accuracies": 0.875, "rewards/chosen": 0.519432783126831, "rewards/grad_term": 0.019725538790225983, "rewards/margins": 5.201993942260742, "rewards/rejected": -4.682560920715332, "step": 282 }, { "epoch": 0.5870608064307015, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.0625, "grad_norm": 38.36199057696001, "learning_rate": 7.866205305651672e-07, "logits/chosen": 0.11121785640716553, "logits/rejected": 0.21377022564411163, "logps/accuracies": 0.9375, "logps/chosen": -260.2328186035156, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -243.456298828125, "logps/ref_rejected": -277.0938720703125, "logps/rejected": -323.23675537109375, "loss": 0.5608, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6776524782180786, "rewards/grad_term": 0.029680585488677025, "rewards/margins": 2.9366343021392822, "rewards/rejected": -4.61428689956665, "step": 283 }, { "epoch": 0.5891352262414106, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.3125, "grad_norm": 18.32697126494198, "learning_rate": 7.854671280276817e-07, "logits/chosen": 0.09643738716840744, "logits/rejected": 0.13956782221794128, "logps/accuracies": 0.6875, "logps/chosen": -354.23077392578125, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -343.4710388183594, "logps/ref_rejected": -336.4991760253906, "logps/rejected": -402.7376403808594, "loss": 0.6159, "rewards/accuracies": 0.9375, "rewards/chosen": -1.075973629951477, "rewards/grad_term": 0.011539540253579617, "rewards/margins": 5.547872543334961, "rewards/rejected": -6.623846054077148, "step": 284 }, { "epoch": 0.5912096460521198, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 52.40142941235436, "learning_rate": 7.84313725490196e-07, "logits/chosen": 0.11267786473035812, "logits/rejected": 0.16389338672161102, "logps/accuracies": 0.875, "logps/chosen": -330.61627197265625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -335.897216796875, "logps/ref_rejected": -348.2872619628906, "logps/rejected": -430.4730224609375, "loss": 0.5283, "rewards/accuracies": 1.0, "rewards/chosen": 0.5280970931053162, "rewards/grad_term": 0.00416451646015048, "rewards/margins": 8.746676445007324, "rewards/rejected": -8.218579292297363, "step": 285 }, { "epoch": 0.593284065862829, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 47.45081130082864, "learning_rate": 7.831603229527105e-07, "logits/chosen": -0.08656018227338791, "logits/rejected": -0.05061071738600731, "logps/accuracies": 0.875, "logps/chosen": -304.7203369140625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -296.37933349609375, "logps/ref_rejected": -303.2038269042969, "logps/rejected": -393.2415771484375, "loss": 0.5669, "rewards/accuracies": 0.875, "rewards/chosen": -0.8340997695922852, "rewards/grad_term": 0.011894619092345238, "rewards/margins": 8.169673919677734, "rewards/rejected": -9.003772735595703, "step": 286 }, { "epoch": 0.5953584856735382, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 21.830125532823576, "learning_rate": 7.820069204152249e-07, "logits/chosen": 0.16923511028289795, "logits/rejected": 0.16749337315559387, "logps/accuracies": 0.8125, "logps/chosen": -300.83929443359375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -280.837158203125, "logps/ref_rejected": -279.3500671386719, "logps/rejected": -371.1993103027344, "loss": 0.5992, "rewards/accuracies": 0.9375, "rewards/chosen": -2.000213623046875, "rewards/grad_term": 0.007134607993066311, "rewards/margins": 7.1847124099731445, "rewards/rejected": -9.184926986694336, "step": 287 }, { "epoch": 0.5974329054842473, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 29.122328000376466, "learning_rate": 7.808535178777393e-07, "logits/chosen": 0.027427153661847115, "logits/rejected": 0.04789198189973831, "logps/accuracies": 0.8125, "logps/chosen": -390.447509765625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -367.87188720703125, "logps/ref_rejected": -355.06640625, "logps/rejected": -450.7095031738281, "loss": 0.5902, "rewards/accuracies": 0.875, "rewards/chosen": -2.2575621604919434, "rewards/grad_term": 0.009585607796907425, "rewards/margins": 7.306746482849121, "rewards/rejected": -9.564309120178223, "step": 288 }, { "epoch": 0.5995073252949565, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.25, "grad_norm": 41.33190625329714, "learning_rate": 7.797001153402537e-07, "logits/chosen": 0.07080674171447754, "logits/rejected": 0.10809577256441116, "logps/accuracies": 0.75, "logps/chosen": -280.20654296875, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -284.79046630859375, "logps/ref_rejected": -268.11834716796875, "logps/rejected": -343.5045166015625, "loss": 0.5622, "rewards/accuracies": 0.9375, "rewards/chosen": 0.45839110016822815, "rewards/grad_term": 0.006786561571061611, "rewards/margins": 7.997011661529541, "rewards/rejected": -7.5386199951171875, "step": 289 }, { "epoch": 0.6015817451056658, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 22.37995830872316, "learning_rate": 7.785467128027681e-07, "logits/chosen": 0.046341296285390854, "logits/rejected": 0.08901657164096832, "logps/accuracies": 1.0, "logps/chosen": -315.9812927246094, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -314.4349365234375, "logps/ref_rejected": -321.4173583984375, "logps/rejected": -420.6228942871094, "loss": 0.5426, "rewards/accuracies": 1.0, "rewards/chosen": -0.15463726222515106, "rewards/grad_term": 0.0006936362478882074, "rewards/margins": 9.76591682434082, "rewards/rejected": -9.920555114746094, "step": 290 }, { "epoch": 0.603656164916375, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 31.611576893068335, "learning_rate": 7.773933102652825e-07, "logits/chosen": 0.1491805762052536, "logits/rejected": 0.1622200310230255, "logps/accuracies": 0.875, "logps/chosen": -324.14556884765625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -316.1439208984375, "logps/ref_rejected": -310.8943176269531, "logps/rejected": -400.73760986328125, "loss": 0.5327, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8001646995544434, "rewards/grad_term": 0.008960644714534283, "rewards/margins": 8.184164047241211, "rewards/rejected": -8.984328269958496, "step": 291 }, { "epoch": 0.6057305847270842, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 40.66113218146233, "learning_rate": 7.76239907727797e-07, "logits/chosen": 0.1667100340127945, "logits/rejected": 0.1031753420829773, "logps/accuracies": 0.875, "logps/chosen": -257.600830078125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -251.26223754882812, "logps/ref_rejected": -256.9619445800781, "logps/rejected": -332.1715393066406, "loss": 0.5644, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6338596940040588, "rewards/grad_term": 0.00917502585798502, "rewards/margins": 6.887094974517822, "rewards/rejected": -7.5209550857543945, "step": 292 }, { "epoch": 0.6078050045377933, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 57.97079310383917, "learning_rate": 7.750865051903114e-07, "logits/chosen": -0.08806827664375305, "logits/rejected": -0.045274168252944946, "logps/accuracies": 0.75, "logps/chosen": -308.90509033203125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -310.9958190917969, "logps/ref_rejected": -308.5406494140625, "logps/rejected": -372.4405212402344, "loss": 0.54, "rewards/accuracies": 1.0, "rewards/chosen": 0.20907306671142578, "rewards/grad_term": 0.0065223718993365765, "rewards/margins": 6.599061489105225, "rewards/rejected": -6.389988899230957, "step": 293 }, { "epoch": 0.6098794243485025, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.3125, "grad_norm": 49.04536258498517, "learning_rate": 7.739331026528259e-07, "logits/chosen": 0.17086654901504517, "logits/rejected": 0.19536878168582916, "logps/accuracies": 0.6875, "logps/chosen": -318.17401123046875, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -322.79205322265625, "logps/ref_rejected": -298.66278076171875, "logps/rejected": -354.1503601074219, "loss": 0.5665, "rewards/accuracies": 0.875, "rewards/chosen": 0.4618016183376312, "rewards/grad_term": 0.015101278200745583, "rewards/margins": 6.010561466217041, "rewards/rejected": -5.548760414123535, "step": 294 }, { "epoch": 0.6119538441592117, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 29.110328186799425, "learning_rate": 7.727797001153403e-07, "logits/chosen": 0.2920646667480469, "logits/rejected": 0.3339766263961792, "logps/accuracies": 0.875, "logps/chosen": -287.7585144042969, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -291.96600341796875, "logps/ref_rejected": -315.6741027832031, "logps/rejected": -371.0801696777344, "loss": 0.5743, "rewards/accuracies": 0.875, "rewards/chosen": 0.4207479953765869, "rewards/grad_term": 0.013844680972397327, "rewards/margins": 5.96135139465332, "rewards/rejected": -5.540602684020996, "step": 295 }, { "epoch": 0.6140282639699209, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 53.230756957534005, "learning_rate": 7.716262975778547e-07, "logits/chosen": 0.12378720194101334, "logits/rejected": 0.16028910875320435, "logps/accuracies": 0.8125, "logps/chosen": -291.1141357421875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -294.4344482421875, "logps/ref_rejected": -296.4821472167969, "logps/rejected": -359.0552062988281, "loss": 0.5617, "rewards/accuracies": 1.0, "rewards/chosen": 0.33203190565109253, "rewards/grad_term": 0.005090603604912758, "rewards/margins": 6.58933687210083, "rewards/rejected": -6.257304668426514, "step": 296 }, { "epoch": 0.6161026837806302, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 49.27245845537937, "learning_rate": 7.704728950403691e-07, "logits/chosen": 0.07910759747028351, "logits/rejected": 0.08938741683959961, "logps/accuracies": 0.8125, "logps/chosen": -336.2572021484375, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -347.0129089355469, "logps/ref_rejected": -345.33203125, "logps/rejected": -390.4123229980469, "loss": 0.5452, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0755696296691895, "rewards/grad_term": 0.01720615103840828, "rewards/margins": 5.583600044250488, "rewards/rejected": -4.508030414581299, "step": 297 }, { "epoch": 0.6181771035913393, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 52.587110731751046, "learning_rate": 7.693194925028835e-07, "logits/chosen": 0.09930308163166046, "logits/rejected": 0.21927960216999054, "logps/accuracies": 0.75, "logps/chosen": -221.76551818847656, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -217.18284606933594, "logps/ref_rejected": -224.5638427734375, "logps/rejected": -282.2660217285156, "loss": 0.5868, "rewards/accuracies": 0.875, "rewards/chosen": -0.4582689702510834, "rewards/grad_term": 0.015134723857045174, "rewards/margins": 5.311949253082275, "rewards/rejected": -5.7702178955078125, "step": 298 }, { "epoch": 0.6202515234020485, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 50.13841731685137, "learning_rate": 7.681660899653979e-07, "logits/chosen": 0.05825243890285492, "logits/rejected": 0.1010754331946373, "logps/accuracies": 0.875, "logps/chosen": -372.16961669921875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -380.7333679199219, "logps/ref_rejected": -376.558349609375, "logps/rejected": -447.0321350097656, "loss": 0.4912, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8563790321350098, "rewards/grad_term": 0.006374266929924488, "rewards/margins": 7.903756141662598, "rewards/rejected": -7.047377586364746, "step": 299 }, { "epoch": 0.6223259432127577, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.3125, "grad_norm": 18.47274167497265, "learning_rate": 7.670126874279122e-07, "logits/chosen": 0.01922018826007843, "logits/rejected": 0.10939830541610718, "logps/accuracies": 0.6875, "logps/chosen": -290.3101806640625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -292.36676025390625, "logps/ref_rejected": -279.2781066894531, "logps/rejected": -335.0788269042969, "loss": 0.5304, "rewards/accuracies": 0.9375, "rewards/chosen": 0.20565718412399292, "rewards/grad_term": 0.012530253268778324, "rewards/margins": 5.785726547241211, "rewards/rejected": -5.580069541931152, "step": 300 }, { "epoch": 0.6244003630234669, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.0625, "grad_norm": 35.40752217083433, "learning_rate": 7.658592848904267e-07, "logits/chosen": 0.2818371653556824, "logits/rejected": 0.41613805294036865, "logps/accuracies": 0.9375, "logps/chosen": -254.01771545410156, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -242.12171936035156, "logps/ref_rejected": -286.4274597167969, "logps/rejected": -363.48065185546875, "loss": 0.5697, "rewards/accuracies": 0.9375, "rewards/chosen": -1.189597725868225, "rewards/grad_term": 0.010368636809289455, "rewards/margins": 6.51572322845459, "rewards/rejected": -7.705321311950684, "step": 301 }, { "epoch": 0.626474782834176, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 56.807968171838894, "learning_rate": 7.647058823529411e-07, "logits/chosen": 0.23013733327388763, "logits/rejected": 0.2862010598182678, "logps/accuracies": 0.9375, "logps/chosen": -326.96173095703125, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -329.29400634765625, "logps/ref_rejected": -335.8630676269531, "logps/rejected": -414.28753662109375, "loss": 0.4896, "rewards/accuracies": 1.0, "rewards/chosen": 0.23322616517543793, "rewards/grad_term": 0.0038325442001223564, "rewards/margins": 8.075675010681152, "rewards/rejected": -7.842449188232422, "step": 302 }, { "epoch": 0.6285492026448852, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 30.936866332774525, "learning_rate": 7.635524798154555e-07, "logits/chosen": 0.38763344287872314, "logits/rejected": 0.42555707693099976, "logps/accuracies": 0.8125, "logps/chosen": -309.457763671875, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -298.310791015625, "logps/ref_rejected": -305.3328857421875, "logps/rejected": -386.14520263671875, "loss": 0.5817, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1147011518478394, "rewards/grad_term": 0.008538071066141129, "rewards/margins": 6.966533660888672, "rewards/rejected": -8.0812349319458, "step": 303 }, { "epoch": 0.6306236224555944, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.25, "grad_norm": 39.0549516486327, "learning_rate": 7.623990772779699e-07, "logits/chosen": 0.3378972113132477, "logits/rejected": 0.33350008726119995, "logps/accuracies": 0.75, "logps/chosen": -296.42120361328125, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -286.9034423828125, "logps/ref_rejected": -254.8472137451172, "logps/rejected": -342.2765808105469, "loss": 0.5692, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9517745971679688, "rewards/grad_term": 0.009382160380482674, "rewards/margins": 7.791163921356201, "rewards/rejected": -8.742938995361328, "step": 304 }, { "epoch": 0.6326980422663037, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 25.487656661381987, "learning_rate": 7.612456747404843e-07, "logits/chosen": -0.0026643723249435425, "logits/rejected": 0.13260780274868011, "logps/accuracies": 0.8125, "logps/chosen": -337.88592529296875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -335.2144775390625, "logps/ref_rejected": -367.57647705078125, "logps/rejected": -445.0696716308594, "loss": 0.5336, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2671446204185486, "rewards/grad_term": 0.009766257368028164, "rewards/margins": 7.482178211212158, "rewards/rejected": -7.749322891235352, "step": 305 }, { "epoch": 0.6347724620770129, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 53.78629399737712, "learning_rate": 7.600922722029988e-07, "logits/chosen": 0.17397280037403107, "logits/rejected": 0.13213837146759033, "logps/accuracies": 0.875, "logps/chosen": -296.3286437988281, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -288.9657897949219, "logps/ref_rejected": -271.7525329589844, "logps/rejected": -347.7431640625, "loss": 0.6198, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7362870573997498, "rewards/grad_term": 0.008981076069176197, "rewards/margins": 6.862776756286621, "rewards/rejected": -7.599064826965332, "step": 306 }, { "epoch": 0.636846881887722, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0, "grad_norm": 38.57273088962716, "learning_rate": 7.589388696655133e-07, "logits/chosen": 0.16268330812454224, "logits/rejected": 0.30625462532043457, "logps/accuracies": 1.0, "logps/chosen": -300.149169921875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -308.031982421875, "logps/ref_rejected": -308.461181640625, "logps/rejected": -392.94122314453125, "loss": 0.52, "rewards/accuracies": 1.0, "rewards/chosen": 0.7882769703865051, "rewards/grad_term": 0.0005324217490851879, "rewards/margins": 9.236281394958496, "rewards/rejected": -8.448005676269531, "step": 307 }, { "epoch": 0.6389213016984312, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.25, "grad_norm": 14.679022976693457, "learning_rate": 7.577854671280276e-07, "logits/chosen": 0.11276095360517502, "logits/rejected": 0.15488047897815704, "logps/accuracies": 0.75, "logps/chosen": -320.9672546386719, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -317.51580810546875, "logps/ref_rejected": -312.4486999511719, "logps/rejected": -392.2602844238281, "loss": 0.5461, "rewards/accuracies": 1.0, "rewards/chosen": -0.3451465368270874, "rewards/grad_term": 0.005722560919821262, "rewards/margins": 7.636013507843018, "rewards/rejected": -7.9811601638793945, "step": 308 }, { "epoch": 0.6409957215091404, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0, "grad_norm": 44.76031279155951, "learning_rate": 7.566320645905421e-07, "logits/chosen": 0.15411929786205292, "logits/rejected": 0.17396250367164612, "logps/accuracies": 1.0, "logps/chosen": -271.82232666015625, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -267.814697265625, "logps/ref_rejected": -285.9453430175781, "logps/rejected": -360.15850830078125, "loss": 0.5586, "rewards/accuracies": 0.9375, "rewards/chosen": -0.40076228976249695, "rewards/grad_term": 0.009672937914729118, "rewards/margins": 7.020550727844238, "rewards/rejected": -7.421313762664795, "step": 309 }, { "epoch": 0.6430701413198496, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 53.09216498794514, "learning_rate": 7.554786620530565e-07, "logits/chosen": 0.1644693911075592, "logits/rejected": 0.23783330619335175, "logps/accuracies": 0.9375, "logps/chosen": -335.7210693359375, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -341.6274719238281, "logps/ref_rejected": -344.434814453125, "logps/rejected": -419.43768310546875, "loss": 0.5501, "rewards/accuracies": 1.0, "rewards/chosen": 0.5906396508216858, "rewards/grad_term": 0.006133922841399908, "rewards/margins": 8.090925216674805, "rewards/rejected": -7.500285625457764, "step": 310 }, { "epoch": 0.6451445611305588, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0625, "flips/incorrect->incorrect": 0.1875, "grad_norm": 41.39136172238844, "learning_rate": 7.543252595155709e-07, "logits/chosen": 0.06381943821907043, "logits/rejected": 0.08010812848806381, "logps/accuracies": 0.8125, "logps/chosen": -213.8368682861328, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -210.74017333984375, "logps/ref_rejected": -222.75961303710938, "logps/rejected": -290.5465087890625, "loss": 0.5381, "rewards/accuracies": 0.9375, "rewards/chosen": -0.30966925621032715, "rewards/grad_term": 0.008784075267612934, "rewards/margins": 6.469019889831543, "rewards/rejected": -6.778688907623291, "step": 311 }, { "epoch": 0.647218980941268, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 65.96838712092502, "learning_rate": 7.531718569780853e-07, "logits/chosen": 0.14972330629825592, "logits/rejected": 0.19796700775623322, "logps/accuracies": 0.8125, "logps/chosen": -271.4801940917969, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -271.1293029785156, "logps/ref_rejected": -286.2263488769531, "logps/rejected": -364.43609619140625, "loss": 0.4888, "rewards/accuracies": 0.9375, "rewards/chosen": -0.035090282559394836, "rewards/grad_term": 0.005572815891355276, "rewards/margins": 7.785881042480469, "rewards/rejected": -7.8209710121154785, "step": 312 }, { "epoch": 0.6492934007519772, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 65.98088553626904, "learning_rate": 7.520184544405997e-07, "logits/chosen": 0.14395220577716827, "logits/rejected": 0.09385178238153458, "logps/accuracies": 0.8125, "logps/chosen": -357.7359924316406, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -338.0129089355469, "logps/ref_rejected": -343.6756896972656, "logps/rejected": -418.16510009765625, "loss": 0.563, "rewards/accuracies": 0.9375, "rewards/chosen": -1.972308874130249, "rewards/grad_term": 0.013005997985601425, "rewards/margins": 5.476626873016357, "rewards/rejected": -7.4489359855651855, "step": 313 }, { "epoch": 0.6513678205626864, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 56.93501034330218, "learning_rate": 7.508650519031141e-07, "logits/chosen": 0.13935233652591705, "logits/rejected": 0.19025281071662903, "logps/accuracies": 0.8125, "logps/chosen": -225.3142547607422, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -217.11978149414062, "logps/ref_rejected": -215.02268981933594, "logps/rejected": -280.1806640625, "loss": 0.5613, "rewards/accuracies": 1.0, "rewards/chosen": -0.8194477558135986, "rewards/grad_term": 0.01098605990409851, "rewards/margins": 5.696350574493408, "rewards/rejected": -6.5157976150512695, "step": 314 }, { "epoch": 0.6534422403733956, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0625, "grad_norm": 47.21672943404337, "learning_rate": 7.497116493656286e-07, "logits/chosen": 0.12312566488981247, "logits/rejected": 0.11346716433763504, "logps/accuracies": 0.9375, "logps/chosen": -278.6199951171875, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -275.3497009277344, "logps/ref_rejected": -268.13726806640625, "logps/rejected": -360.0812072753906, "loss": 0.627, "rewards/accuracies": 1.0, "rewards/chosen": -0.32702964544296265, "rewards/grad_term": 0.00423807417973876, "rewards/margins": 8.867365837097168, "rewards/rejected": -9.194396018981934, "step": 315 }, { "epoch": 0.6555166601841047, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 42.73491649478954, "learning_rate": 7.485582468281429e-07, "logits/chosen": 0.12578445672988892, "logits/rejected": 0.11325564980506897, "logps/accuracies": 0.9375, "logps/chosen": -312.9777526855469, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -294.9630432128906, "logps/ref_rejected": -309.66436767578125, "logps/rejected": -403.28887939453125, "loss": 0.5566, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8014687299728394, "rewards/grad_term": 0.006151386070996523, "rewards/margins": 7.560985088348389, "rewards/rejected": -9.36245346069336, "step": 316 }, { "epoch": 0.6575910799948139, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 26.036325870194347, "learning_rate": 7.474048442906574e-07, "logits/chosen": 0.2790083587169647, "logits/rejected": 0.30802425742149353, "logps/accuracies": 0.875, "logps/chosen": -284.39532470703125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -284.0159606933594, "logps/ref_rejected": -297.21063232421875, "logps/rejected": -367.1194763183594, "loss": 0.5027, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03793831914663315, "rewards/grad_term": 0.00753421988338232, "rewards/margins": 6.952947616577148, "rewards/rejected": -6.9908857345581055, "step": 317 }, { "epoch": 0.6596654998055231, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.1875, "grad_norm": 48.848208501371566, "learning_rate": 7.462514417531717e-07, "logits/chosen": 0.23110151290893555, "logits/rejected": 0.23169729113578796, "logps/accuracies": 0.8125, "logps/chosen": -345.29351806640625, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -346.4335632324219, "logps/ref_rejected": -320.21112060546875, "logps/rejected": -389.7298889160156, "loss": 0.5338, "rewards/accuracies": 1.0, "rewards/chosen": 0.11400166153907776, "rewards/grad_term": 0.005073026288300753, "rewards/margins": 7.065882205963135, "rewards/rejected": -6.951880931854248, "step": 318 }, { "epoch": 0.6617399196162324, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.4375, "grad_norm": 37.43721667408988, "learning_rate": 7.450980392156863e-07, "logits/chosen": -0.11392828822135925, "logits/rejected": -0.1736583262681961, "logps/accuracies": 0.5625, "logps/chosen": -349.2891540527344, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -354.2510986328125, "logps/ref_rejected": -316.1826477050781, "logps/rejected": -374.9449768066406, "loss": 0.5794, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4961950182914734, "rewards/grad_term": 0.01609090529382229, "rewards/margins": 6.372428894042969, "rewards/rejected": -5.87623405456543, "step": 319 }, { "epoch": 0.6638143394269416, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 48.93249687471918, "learning_rate": 7.439446366782007e-07, "logits/chosen": 0.22672495245933533, "logits/rejected": 0.23626157641410828, "logps/accuracies": 0.875, "logps/chosen": -300.6910095214844, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -306.4937744140625, "logps/ref_rejected": -316.8069763183594, "logps/rejected": -371.29150390625, "loss": 0.579, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5802759528160095, "rewards/grad_term": 0.011157616972923279, "rewards/margins": 6.02873420715332, "rewards/rejected": -5.448458194732666, "step": 320 }, { "epoch": 0.6638143394269416, "eval_flips/correct->correct": 0.43842363357543945, "eval_flips/correct->incorrect": 0.004926108289510012, "eval_flips/incorrect->correct": 0.30049261450767517, "eval_flips/incorrect->incorrect": 0.25615763664245605, "eval_logits/chosen": 0.15680116415023804, "eval_logits/rejected": 0.20004509389400482, "eval_logps/accuracies": 0.738916277885437, "eval_logps/chosen": -288.21343994140625, "eval_logps/ref_accuracies": 0.4433497488498688, "eval_logps/ref_chosen": -287.3511047363281, "eval_logps/ref_rejected": -289.0460205078125, "eval_logps/rejected": -336.36444091796875, "eval_loss": 0.6191994547843933, "eval_rewards/accuracies": 0.871921181678772, "eval_rewards/chosen": -0.08623380959033966, "eval_rewards/grad_term": 0.017411047592759132, "eval_rewards/margins": 4.645606994628906, "eval_rewards/rejected": -4.731841087341309, "eval_runtime": 800.1629, "eval_samples_per_second": 2.022, "eval_steps_per_second": 0.254, "step": 320 }, { "epoch": 0.6658887592376507, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 89.14243856965115, "learning_rate": 7.427912341407151e-07, "logits/chosen": 0.25495031476020813, "logits/rejected": 0.36095547676086426, "logps/accuracies": 0.875, "logps/chosen": -296.5644836425781, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -300.39422607421875, "logps/ref_rejected": -350.05474853515625, "logps/rejected": -397.4334716796875, "loss": 0.6239, "rewards/accuracies": 0.9375, "rewards/chosen": 0.38297611474990845, "rewards/grad_term": 0.014660445041954517, "rewards/margins": 5.120844841003418, "rewards/rejected": -4.7378692626953125, "step": 321 }, { "epoch": 0.6679631790483599, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.375, "grad_norm": 16.638863764418918, "learning_rate": 7.416378316032295e-07, "logits/chosen": -0.062116291373968124, "logits/rejected": 0.08867709338665009, "logps/accuracies": 0.625, "logps/chosen": -348.11456298828125, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -352.7536315917969, "logps/ref_rejected": -350.36370849609375, "logps/rejected": -394.8106689453125, "loss": 0.5791, "rewards/accuracies": 0.875, "rewards/chosen": 0.4639059007167816, "rewards/grad_term": 0.016094159334897995, "rewards/margins": 4.908601760864258, "rewards/rejected": -4.444696426391602, "step": 322 }, { "epoch": 0.6700375988590691, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.1875, "grad_norm": 69.32056292827173, "learning_rate": 7.404844290657439e-07, "logits/chosen": 0.2704838514328003, "logits/rejected": 0.2706650495529175, "logps/accuracies": 0.75, "logps/chosen": -313.2464599609375, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -315.1943664550781, "logps/ref_rejected": -296.2995300292969, "logps/rejected": -356.75299072265625, "loss": 0.5689, "rewards/accuracies": 0.8125, "rewards/chosen": 0.19478662312030792, "rewards/grad_term": 0.016283176839351654, "rewards/margins": 6.240136623382568, "rewards/rejected": -6.045351028442383, "step": 323 }, { "epoch": 0.6721120186697783, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 53.57049601603343, "learning_rate": 7.393310265282583e-07, "logits/chosen": 0.24967102706432343, "logits/rejected": 0.2552967667579651, "logps/accuracies": 0.8125, "logps/chosen": -270.054443359375, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -260.4380187988281, "logps/ref_rejected": -254.31671142578125, "logps/rejected": -320.3769836425781, "loss": 0.5592, "rewards/accuracies": 0.875, "rewards/chosen": -0.9616467952728271, "rewards/grad_term": 0.017603037878870964, "rewards/margins": 5.644383430480957, "rewards/rejected": -6.606029987335205, "step": 324 }, { "epoch": 0.6741864384804875, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 54.49892498689122, "learning_rate": 7.381776239907728e-07, "logits/chosen": 0.22812658548355103, "logits/rejected": 0.2515120506286621, "logps/accuracies": 0.9375, "logps/chosen": -324.39599609375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -317.1215515136719, "logps/ref_rejected": -327.36962890625, "logps/rejected": -387.77032470703125, "loss": 0.5858, "rewards/accuracies": 0.875, "rewards/chosen": -0.7274415493011475, "rewards/grad_term": 0.015195751562714577, "rewards/margins": 5.312624931335449, "rewards/rejected": -6.040066242218018, "step": 325 }, { "epoch": 0.6762608582911966, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 48.53308556944907, "learning_rate": 7.370242214532871e-07, "logits/chosen": -0.008343299850821495, "logits/rejected": -0.03129954636096954, "logps/accuracies": 0.75, "logps/chosen": -356.82958984375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -344.5155029296875, "logps/ref_rejected": -331.8990783691406, "logps/rejected": -418.02227783203125, "loss": 0.5789, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2314121723175049, "rewards/grad_term": 0.005676737520843744, "rewards/margins": 7.380904197692871, "rewards/rejected": -8.612316131591797, "step": 326 }, { "epoch": 0.6783352781019059, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.375, "grad_norm": 98.7994855905736, "learning_rate": 7.358708189158016e-07, "logits/chosen": 0.005753070116043091, "logits/rejected": 0.01671770215034485, "logps/accuracies": 0.625, "logps/chosen": -313.5362243652344, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -289.57489013671875, "logps/ref_rejected": -292.51513671875, "logps/rejected": -378.51617431640625, "loss": 0.595, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3961288928985596, "rewards/grad_term": 0.017515743151307106, "rewards/margins": 6.203976631164551, "rewards/rejected": -8.600106239318848, "step": 327 }, { "epoch": 0.6804096979126151, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0, "grad_norm": 46.182216579421194, "learning_rate": 7.347174163783159e-07, "logits/chosen": 0.44519540667533875, "logits/rejected": 0.4516918659210205, "logps/accuracies": 1.0, "logps/chosen": -261.4315490722656, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -263.67852783203125, "logps/ref_rejected": -263.0929260253906, "logps/rejected": -354.0087890625, "loss": 0.5502, "rewards/accuracies": 1.0, "rewards/chosen": 0.22469913959503174, "rewards/grad_term": 0.00040459661977365613, "rewards/margins": 9.316282272338867, "rewards/rejected": -9.091583251953125, "step": 328 }, { "epoch": 0.6824841177233243, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 57.57752345437062, "learning_rate": 7.335640138408304e-07, "logits/chosen": 0.35632041096687317, "logits/rejected": 0.3070759177207947, "logps/accuracies": 0.8125, "logps/chosen": -300.12353515625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -303.2857360839844, "logps/ref_rejected": -288.00970458984375, "logps/rejected": -365.837646484375, "loss": 0.6026, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3162227272987366, "rewards/grad_term": 0.007446423638612032, "rewards/margins": 8.099015235900879, "rewards/rejected": -7.782792568206787, "step": 329 }, { "epoch": 0.6845585375340334, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 53.06794587438548, "learning_rate": 7.324106113033448e-07, "logits/chosen": 0.08217829465866089, "logits/rejected": 0.2244112640619278, "logps/accuracies": 0.9375, "logps/chosen": -315.64288330078125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -322.9136962890625, "logps/ref_rejected": -385.41436767578125, "logps/rejected": -465.26690673828125, "loss": 0.5569, "rewards/accuracies": 1.0, "rewards/chosen": 0.7270775437355042, "rewards/grad_term": 0.004581013694405556, "rewards/margins": 8.71232795715332, "rewards/rejected": -7.985250473022461, "step": 330 }, { "epoch": 0.6866329573447426, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 19.260377458337114, "learning_rate": 7.312572087658593e-07, "logits/chosen": 0.07186198234558105, "logits/rejected": 0.11489441245794296, "logps/accuracies": 0.8125, "logps/chosen": -319.80755615234375, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -325.8714599609375, "logps/ref_rejected": -329.986572265625, "logps/rejected": -402.0074462890625, "loss": 0.5508, "rewards/accuracies": 1.0, "rewards/chosen": 0.6063953042030334, "rewards/grad_term": 0.006578431464731693, "rewards/margins": 7.808480739593506, "rewards/rejected": -7.202085971832275, "step": 331 }, { "epoch": 0.6887073771554518, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0, "grad_norm": 17.756201694843515, "learning_rate": 7.301038062283737e-07, "logits/chosen": 0.20243048667907715, "logits/rejected": 0.28428226709365845, "logps/accuracies": 1.0, "logps/chosen": -302.6923828125, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -292.7161865234375, "logps/ref_rejected": -302.3647766113281, "logps/rejected": -387.39251708984375, "loss": 0.5204, "rewards/accuracies": 0.875, "rewards/chosen": -0.9976207613945007, "rewards/grad_term": 0.008672392927110195, "rewards/margins": 7.505157470703125, "rewards/rejected": -8.502777099609375, "step": 332 }, { "epoch": 0.690781796966161, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.3125, "grad_norm": 83.21722843296402, "learning_rate": 7.289504036908881e-07, "logits/chosen": 0.18854957818984985, "logits/rejected": 0.13426542282104492, "logps/accuracies": 0.6875, "logps/chosen": -330.0719909667969, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -328.91522216796875, "logps/ref_rejected": -307.397705078125, "logps/rejected": -382.39984130859375, "loss": 0.549, "rewards/accuracies": 1.0, "rewards/chosen": -0.11567914485931396, "rewards/grad_term": 0.004070833325386047, "rewards/margins": 7.384533882141113, "rewards/rejected": -7.500212669372559, "step": 333 }, { "epoch": 0.6928562167768703, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 41.93819396079496, "learning_rate": 7.277970011534025e-07, "logits/chosen": 0.0038331379182636738, "logits/rejected": 0.06832897663116455, "logps/accuracies": 0.9375, "logps/chosen": -271.9588317871094, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -270.6300964355469, "logps/ref_rejected": -264.27239990234375, "logps/rejected": -333.4945373535156, "loss": 0.5229, "rewards/accuracies": 1.0, "rewards/chosen": -0.13287392258644104, "rewards/grad_term": 0.004513449501246214, "rewards/margins": 6.789344787597656, "rewards/rejected": -6.922219276428223, "step": 334 }, { "epoch": 0.6949306365875794, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 36.44772897953967, "learning_rate": 7.26643598615917e-07, "logits/chosen": 0.08206385374069214, "logits/rejected": 0.15132063627243042, "logps/accuracies": 0.8125, "logps/chosen": -281.35186767578125, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -291.0823669433594, "logps/ref_rejected": -298.27886962890625, "logps/rejected": -367.3419494628906, "loss": 0.5405, "rewards/accuracies": 1.0, "rewards/chosen": 0.9730521440505981, "rewards/grad_term": 0.002131945453584194, "rewards/margins": 7.879360198974609, "rewards/rejected": -6.906307697296143, "step": 335 }, { "epoch": 0.6970050563982886, "flips/correct->correct": 0.8125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 28.51646796947631, "learning_rate": 7.254901960784313e-07, "logits/chosen": 0.020516425371170044, "logits/rejected": 0.06293690204620361, "logps/accuracies": 0.9375, "logps/chosen": -308.943359375, "logps/ref_accuracies": 0.8125, "logps/ref_chosen": -308.57928466796875, "logps/ref_rejected": -345.8021545410156, "logps/rejected": -399.0782470703125, "loss": 0.5161, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03641030192375183, "rewards/grad_term": 0.012177910655736923, "rewards/margins": 5.291202068328857, "rewards/rejected": -5.327611923217773, "step": 336 }, { "epoch": 0.6990794762089978, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 34.67482984022363, "learning_rate": 7.243367935409458e-07, "logits/chosen": 0.09479643404483795, "logits/rejected": 0.13485944271087646, "logps/accuracies": 0.8125, "logps/chosen": -350.71783447265625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -344.1441955566406, "logps/ref_rejected": -334.01727294921875, "logps/rejected": -425.11920166015625, "loss": 0.5386, "rewards/accuracies": 1.0, "rewards/chosen": -0.6573646068572998, "rewards/grad_term": 0.005361511372029781, "rewards/margins": 8.452826499938965, "rewards/rejected": -9.110189437866211, "step": 337 }, { "epoch": 0.701153896019707, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 92.66931260479595, "learning_rate": 7.231833910034601e-07, "logits/chosen": 0.16966593265533447, "logits/rejected": 0.13631996512413025, "logps/accuracies": 0.875, "logps/chosen": -345.20770263671875, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -337.16741943359375, "logps/ref_rejected": -317.2330017089844, "logps/rejected": -405.1679992675781, "loss": 0.5231, "rewards/accuracies": 1.0, "rewards/chosen": -0.8040257096290588, "rewards/grad_term": 0.003560500219464302, "rewards/margins": 7.989476203918457, "rewards/rejected": -8.793501853942871, "step": 338 }, { "epoch": 0.7032283158304162, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 28.389402918089036, "learning_rate": 7.220299884659746e-07, "logits/chosen": 0.17944695055484772, "logits/rejected": 0.2976837456226349, "logps/accuracies": 0.875, "logps/chosen": -262.23980712890625, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -265.7380676269531, "logps/ref_rejected": -294.07598876953125, "logps/rejected": -336.75543212890625, "loss": 0.5853, "rewards/accuracies": 0.8125, "rewards/chosen": 0.34982502460479736, "rewards/grad_term": 0.018309494480490685, "rewards/margins": 4.617773532867432, "rewards/rejected": -4.267948150634766, "step": 339 }, { "epoch": 0.7053027356411253, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 33.29343285253659, "learning_rate": 7.20876585928489e-07, "logits/chosen": 0.042180366814136505, "logits/rejected": 0.008254090324044228, "logps/accuracies": 0.75, "logps/chosen": -366.08233642578125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -362.78900146484375, "logps/ref_rejected": -327.56585693359375, "logps/rejected": -406.1036071777344, "loss": 0.5219, "rewards/accuracies": 1.0, "rewards/chosen": -0.3293338716030121, "rewards/grad_term": 0.006429283879697323, "rewards/margins": 7.524442672729492, "rewards/rejected": -7.8537774085998535, "step": 340 }, { "epoch": 0.7073771554518345, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 38.22067938349734, "learning_rate": 7.197231833910034e-07, "logits/chosen": 0.1745857149362564, "logits/rejected": 0.22055479884147644, "logps/accuracies": 0.75, "logps/chosen": -223.20620727539062, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -214.50759887695312, "logps/ref_rejected": -225.8865509033203, "logps/rejected": -280.5146179199219, "loss": 0.5904, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8698611259460449, "rewards/grad_term": 0.020344514399766922, "rewards/margins": 4.592945098876953, "rewards/rejected": -5.46280574798584, "step": 341 }, { "epoch": 0.7094515752625438, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 29.038636096202257, "learning_rate": 7.185697808535178e-07, "logits/chosen": 0.14953972399234772, "logits/rejected": 0.15320980548858643, "logps/accuracies": 0.9375, "logps/chosen": -289.46533203125, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -288.3252868652344, "logps/ref_rejected": -298.48101806640625, "logps/rejected": -359.2103576660156, "loss": 0.6004, "rewards/accuracies": 0.75, "rewards/chosen": -0.11400707066059113, "rewards/grad_term": 0.022022824734449387, "rewards/margins": 5.958928108215332, "rewards/rejected": -6.072935104370117, "step": 342 }, { "epoch": 0.711525995073253, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 33.00355372540172, "learning_rate": 7.174163783160324e-07, "logits/chosen": -0.06829185783863068, "logits/rejected": -0.009546427056193352, "logps/accuracies": 0.8125, "logps/chosen": -352.6844177246094, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -354.55364990234375, "logps/ref_rejected": -351.40203857421875, "logps/rejected": -413.4746398925781, "loss": 0.5504, "rewards/accuracies": 0.875, "rewards/chosen": 0.18692292273044586, "rewards/grad_term": 0.01500310655683279, "rewards/margins": 6.394184112548828, "rewards/rejected": -6.207261085510254, "step": 343 }, { "epoch": 0.7136004148839621, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.0625, "grad_norm": 29.545604358917398, "learning_rate": 7.162629757785467e-07, "logits/chosen": 0.21840424835681915, "logits/rejected": 0.3245609402656555, "logps/accuracies": 0.9375, "logps/chosen": -289.7184143066406, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -281.7314758300781, "logps/ref_rejected": -305.2322082519531, "logps/rejected": -371.3037414550781, "loss": 0.5546, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7986934185028076, "rewards/grad_term": 0.02135634422302246, "rewards/margins": 5.8084611892700195, "rewards/rejected": -6.607154369354248, "step": 344 }, { "epoch": 0.7156748346946713, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 18.685102213495963, "learning_rate": 7.151095732410612e-07, "logits/chosen": 0.33599621057510376, "logits/rejected": 0.259542852640152, "logps/accuracies": 0.8125, "logps/chosen": -297.86285400390625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -309.85284423828125, "logps/ref_rejected": -319.9841613769531, "logps/rejected": -364.2496643066406, "loss": 0.5428, "rewards/accuracies": 1.0, "rewards/chosen": 1.1990000009536743, "rewards/grad_term": 0.012635907158255577, "rewards/margins": 5.625548839569092, "rewards/rejected": -4.426548957824707, "step": 345 }, { "epoch": 0.7177492545053805, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 51.73399077401343, "learning_rate": 7.139561707035755e-07, "logits/chosen": 0.19686900079250336, "logits/rejected": 0.2250552475452423, "logps/accuracies": 0.875, "logps/chosen": -334.487548828125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -344.34234619140625, "logps/ref_rejected": -351.153564453125, "logps/rejected": -423.1881103515625, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": 0.9854794144630432, "rewards/grad_term": 0.0029634374659508467, "rewards/margins": 8.188934326171875, "rewards/rejected": -7.203455924987793, "step": 346 }, { "epoch": 0.7198236743160897, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 36.992429397717665, "learning_rate": 7.1280276816609e-07, "logits/chosen": 0.3227195143699646, "logits/rejected": 0.3476618230342865, "logps/accuracies": 0.8125, "logps/chosen": -283.4656066894531, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -281.3516540527344, "logps/ref_rejected": -273.4134826660156, "logps/rejected": -333.3074645996094, "loss": 0.5596, "rewards/accuracies": 0.9375, "rewards/chosen": -0.21139609813690186, "rewards/grad_term": 0.013419999741017818, "rewards/margins": 5.778001308441162, "rewards/rejected": -5.9893975257873535, "step": 347 }, { "epoch": 0.721898094126799, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 41.16834515003933, "learning_rate": 7.116493656286043e-07, "logits/chosen": 0.125450000166893, "logits/rejected": 0.17324930429458618, "logps/accuracies": 0.8125, "logps/chosen": -296.0699157714844, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -281.47979736328125, "logps/ref_rejected": -311.7028503417969, "logps/rejected": -370.5085144042969, "loss": 0.5877, "rewards/accuracies": 0.75, "rewards/chosen": -1.4590116739273071, "rewards/grad_term": 0.020582564175128937, "rewards/margins": 4.421552658081055, "rewards/rejected": -5.8805646896362305, "step": 348 }, { "epoch": 0.723972513937508, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 28.04679377044801, "learning_rate": 7.104959630911188e-07, "logits/chosen": 0.02426442876458168, "logits/rejected": 0.030082188546657562, "logps/accuracies": 0.875, "logps/chosen": -329.852294921875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -335.4429016113281, "logps/ref_rejected": -336.506103515625, "logps/rejected": -406.5826416015625, "loss": 0.5545, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5590592622756958, "rewards/grad_term": 0.007747030816972256, "rewards/margins": 7.566709995269775, "rewards/rejected": -7.007650852203369, "step": 349 }, { "epoch": 0.7260469337482173, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0, "grad_norm": 33.4741646395165, "learning_rate": 7.093425605536332e-07, "logits/chosen": 0.014736661687493324, "logits/rejected": 0.02637672983109951, "logps/accuracies": 1.0, "logps/chosen": -317.66864013671875, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -317.7265930175781, "logps/ref_rejected": -342.7004699707031, "logps/rejected": -423.1243896484375, "loss": 0.5797, "rewards/accuracies": 1.0, "rewards/chosen": 0.005797684192657471, "rewards/grad_term": 0.006446592975407839, "rewards/margins": 8.048192977905273, "rewards/rejected": -8.04239559173584, "step": 350 }, { "epoch": 0.7281213535589265, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 26.26791550365828, "learning_rate": 7.081891580161476e-07, "logits/chosen": -0.018278811126947403, "logits/rejected": -0.055383071303367615, "logps/accuracies": 0.875, "logps/chosen": -334.0198974609375, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -332.0812683105469, "logps/ref_rejected": -322.0040588378906, "logps/rejected": -413.3955078125, "loss": 0.5075, "rewards/accuracies": 0.875, "rewards/chosen": -0.19386102259159088, "rewards/grad_term": 0.009265870787203312, "rewards/margins": 8.945282936096191, "rewards/rejected": -9.139144897460938, "step": 351 }, { "epoch": 0.7301957733696357, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 31.547111088022394, "learning_rate": 7.07035755478662e-07, "logits/chosen": 0.07085268199443817, "logits/rejected": 0.11351241916418076, "logps/accuracies": 0.875, "logps/chosen": -323.64910888671875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -324.082275390625, "logps/ref_rejected": -336.0316467285156, "logps/rejected": -416.46917724609375, "loss": 0.5288, "rewards/accuracies": 1.0, "rewards/chosen": 0.04331676661968231, "rewards/grad_term": 0.003617448266595602, "rewards/margins": 8.087069511413574, "rewards/rejected": -8.043752670288086, "step": 352 }, { "epoch": 0.7322701931803449, "flips/correct->correct": 0.1875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.25, "grad_norm": 44.5849999823685, "learning_rate": 7.058823529411765e-07, "logits/chosen": 0.3111583888530731, "logits/rejected": 0.2975752055644989, "logps/accuracies": 0.75, "logps/chosen": -315.43798828125, "logps/ref_accuracies": 0.1875, "logps/ref_chosen": -307.661865234375, "logps/ref_rejected": -282.1680908203125, "logps/rejected": -370.3470153808594, "loss": 0.5453, "rewards/accuracies": 0.9375, "rewards/chosen": -0.777613639831543, "rewards/grad_term": 0.010531319305300713, "rewards/margins": 8.040277481079102, "rewards/rejected": -8.817892074584961, "step": 353 }, { "epoch": 0.734344612991054, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 42.86231892921175, "learning_rate": 7.047289504036908e-07, "logits/chosen": 0.2585771083831787, "logits/rejected": 0.3336886465549469, "logps/accuracies": 0.9375, "logps/chosen": -327.9022216796875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -332.4975891113281, "logps/ref_rejected": -343.6222839355469, "logps/rejected": -417.87591552734375, "loss": 0.5632, "rewards/accuracies": 1.0, "rewards/chosen": 0.459539532661438, "rewards/grad_term": 0.006328054238110781, "rewards/margins": 7.8848958015441895, "rewards/rejected": -7.425356864929199, "step": 354 }, { "epoch": 0.7364190328017632, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 38.37684555724554, "learning_rate": 7.035755478662053e-07, "logits/chosen": 0.07023249566555023, "logits/rejected": 0.09015891700983047, "logps/accuracies": 0.8125, "logps/chosen": -336.48968505859375, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -324.7945861816406, "logps/ref_rejected": -318.7522888183594, "logps/rejected": -398.0233154296875, "loss": 0.6105, "rewards/accuracies": 0.875, "rewards/chosen": -1.1695095300674438, "rewards/grad_term": 0.01756344363093376, "rewards/margins": 6.757594585418701, "rewards/rejected": -7.927104473114014, "step": 355 }, { "epoch": 0.7384934526124725, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.125, "grad_norm": 35.08769051586877, "learning_rate": 7.024221453287197e-07, "logits/chosen": 0.07610762119293213, "logits/rejected": 0.18170149624347687, "logps/accuracies": 0.875, "logps/chosen": -259.91943359375, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -262.623779296875, "logps/ref_rejected": -292.6954040527344, "logps/rejected": -350.150146484375, "loss": 0.5485, "rewards/accuracies": 1.0, "rewards/chosen": 0.27043378353118896, "rewards/grad_term": 0.011945006437599659, "rewards/margins": 6.015911102294922, "rewards/rejected": -5.745476722717285, "step": 356 }, { "epoch": 0.7405678724231817, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 26.590182989230307, "learning_rate": 7.012687427912342e-07, "logits/chosen": -0.002248242497444153, "logits/rejected": 0.0742294117808342, "logps/accuracies": 0.8125, "logps/chosen": -248.39425659179688, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -245.6069793701172, "logps/ref_rejected": -279.5992736816406, "logps/rejected": -343.59112548828125, "loss": 0.522, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2787279188632965, "rewards/grad_term": 0.017660701647400856, "rewards/margins": 6.120457172393799, "rewards/rejected": -6.399184703826904, "step": 357 }, { "epoch": 0.7426422922338909, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.25, "grad_norm": 29.97128814142722, "learning_rate": 7.001153402537486e-07, "logits/chosen": 0.05706937611103058, "logits/rejected": 0.20207172632217407, "logps/accuracies": 0.75, "logps/chosen": -336.1678771972656, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -342.0132751464844, "logps/ref_rejected": -353.97802734375, "logps/rejected": -400.2590637207031, "loss": 0.4873, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5845356583595276, "rewards/grad_term": 0.012262849137187004, "rewards/margins": 5.212644577026367, "rewards/rejected": -4.628108978271484, "step": 358 }, { "epoch": 0.7447167120446, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 63.41932463356418, "learning_rate": 6.98961937716263e-07, "logits/chosen": 0.21743306517601013, "logits/rejected": 0.2811052203178406, "logps/accuracies": 0.75, "logps/chosen": -293.9018249511719, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -303.6950988769531, "logps/ref_rejected": -292.83917236328125, "logps/rejected": -354.6083068847656, "loss": 0.5279, "rewards/accuracies": 1.0, "rewards/chosen": 0.9793245792388916, "rewards/grad_term": 0.009044105187058449, "rewards/margins": 7.156236171722412, "rewards/rejected": -6.176911354064941, "step": 359 }, { "epoch": 0.7467911318553092, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 89.9026831204858, "learning_rate": 6.978085351787774e-07, "logits/chosen": 0.40088769793510437, "logits/rejected": 0.40543943643569946, "logps/accuracies": 0.8125, "logps/chosen": -255.18496704101562, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -259.64080810546875, "logps/ref_rejected": -265.6492919921875, "logps/rejected": -336.0693359375, "loss": 0.5575, "rewards/accuracies": 0.9375, "rewards/chosen": 0.44558367133140564, "rewards/grad_term": 0.01041356474161148, "rewards/margins": 7.487587928771973, "rewards/rejected": -7.042004108428955, "step": 360 }, { "epoch": 0.7488655516660184, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 22.748796947859415, "learning_rate": 6.966551326412918e-07, "logits/chosen": 0.31769663095474243, "logits/rejected": 0.3735862970352173, "logps/accuracies": 0.8125, "logps/chosen": -305.60565185546875, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -306.97216796875, "logps/ref_rejected": -295.53271484375, "logps/rejected": -375.6974182128906, "loss": 0.5312, "rewards/accuracies": 1.0, "rewards/chosen": 0.1366504728794098, "rewards/grad_term": 0.003756206249818206, "rewards/margins": 8.153119087219238, "rewards/rejected": -8.016468048095703, "step": 361 }, { "epoch": 0.7509399714767276, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.3125, "grad_norm": 48.649828270109175, "learning_rate": 6.955017301038062e-07, "logits/chosen": -0.11286991089582443, "logits/rejected": -0.08522382378578186, "logps/accuracies": 0.6875, "logps/chosen": -306.65692138671875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -299.77130126953125, "logps/ref_rejected": -305.17205810546875, "logps/rejected": -366.79888916015625, "loss": 0.563, "rewards/accuracies": 0.875, "rewards/chosen": -0.688561201095581, "rewards/grad_term": 0.015329258516430855, "rewards/margins": 5.474117279052734, "rewards/rejected": -6.162679195404053, "step": 362 }, { "epoch": 0.7530143912874367, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0625, "grad_norm": 44.582697450921174, "learning_rate": 6.943483275663207e-07, "logits/chosen": 0.16607432067394257, "logits/rejected": 0.19322986900806427, "logps/accuracies": 0.9375, "logps/chosen": -250.20355224609375, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -252.611572265625, "logps/ref_rejected": -272.4234924316406, "logps/rejected": -353.1186218261719, "loss": 0.5006, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24080336093902588, "rewards/grad_term": 0.006381358951330185, "rewards/margins": 8.310314178466797, "rewards/rejected": -8.069511413574219, "step": 363 }, { "epoch": 0.755088811098146, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 34.80591547675486, "learning_rate": 6.93194925028835e-07, "logits/chosen": 0.19473493099212646, "logits/rejected": 0.18632598221302032, "logps/accuracies": 0.8125, "logps/chosen": -256.6549987792969, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -255.31813049316406, "logps/ref_rejected": -249.58592224121094, "logps/rejected": -313.6597900390625, "loss": 0.5498, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13368618488311768, "rewards/grad_term": 0.011986999772489071, "rewards/margins": 6.273699760437012, "rewards/rejected": -6.407385349273682, "step": 364 }, { "epoch": 0.7571632309088552, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.0625, "grad_norm": 41.34316036796179, "learning_rate": 6.920415224913494e-07, "logits/chosen": 0.1397414356470108, "logits/rejected": 0.23811021447181702, "logps/accuracies": 0.9375, "logps/chosen": -315.8439636230469, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -314.4014892578125, "logps/ref_rejected": -363.3650817871094, "logps/rejected": -422.4331359863281, "loss": 0.5638, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14424461126327515, "rewards/grad_term": 0.012570216320455074, "rewards/margins": 5.762563228607178, "rewards/rejected": -5.906806945800781, "step": 365 }, { "epoch": 0.7592376507195644, "flips/correct->correct": 0.1875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.75, "flips/incorrect->incorrect": 0.0625, "grad_norm": 23.772256648413755, "learning_rate": 6.908881199538638e-07, "logits/chosen": 0.05051745846867561, "logits/rejected": -0.02119167149066925, "logps/accuracies": 0.9375, "logps/chosen": -253.32345581054688, "logps/ref_accuracies": 0.1875, "logps/ref_chosen": -254.56048583984375, "logps/ref_rejected": -240.97332763671875, "logps/rejected": -324.1744079589844, "loss": 0.5553, "rewards/accuracies": 1.0, "rewards/chosen": 0.12370094656944275, "rewards/grad_term": 0.0012932950630784035, "rewards/margins": 8.443807601928711, "rewards/rejected": -8.320106506347656, "step": 366 }, { "epoch": 0.7613120705302736, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 38.69831479632815, "learning_rate": 6.897347174163782e-07, "logits/chosen": 0.12788856029510498, "logits/rejected": 0.16543559730052948, "logps/accuracies": 0.9375, "logps/chosen": -277.533447265625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -259.1481628417969, "logps/ref_rejected": -260.858154296875, "logps/rejected": -343.05718994140625, "loss": 0.5869, "rewards/accuracies": 0.9375, "rewards/chosen": -1.838526725769043, "rewards/grad_term": 0.012527183629572392, "rewards/margins": 6.381375789642334, "rewards/rejected": -8.219902992248535, "step": 367 }, { "epoch": 0.7633864903409827, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0, "grad_norm": 35.55521896514245, "learning_rate": 6.885813148788927e-07, "logits/chosen": 0.15278108417987823, "logits/rejected": 0.14702126383781433, "logps/accuracies": 1.0, "logps/chosen": -257.01214599609375, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -247.90902709960938, "logps/ref_rejected": -265.93560791015625, "logps/rejected": -346.7325134277344, "loss": 0.5451, "rewards/accuracies": 0.875, "rewards/chosen": -0.9103094339370728, "rewards/grad_term": 0.011162678711116314, "rewards/margins": 7.169381618499756, "rewards/rejected": -8.079690933227539, "step": 368 }, { "epoch": 0.7654609101516919, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 30.230401166275122, "learning_rate": 6.874279123414071e-07, "logits/chosen": 0.086149662733078, "logits/rejected": 0.22511181235313416, "logps/accuracies": 0.875, "logps/chosen": -233.25259399414062, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -216.24533081054688, "logps/ref_rejected": -227.90420532226562, "logps/rejected": -305.6214599609375, "loss": 0.569, "rewards/accuracies": 0.875, "rewards/chosen": -1.7007248401641846, "rewards/grad_term": 0.013076627627015114, "rewards/margins": 6.071000099182129, "rewards/rejected": -7.771725177764893, "step": 369 }, { "epoch": 0.7675353299624011, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 52.22387916464415, "learning_rate": 6.862745098039216e-07, "logits/chosen": 0.2996848225593567, "logits/rejected": 0.31599316000938416, "logps/accuracies": 0.875, "logps/chosen": -269.8756408691406, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -261.0709533691406, "logps/ref_rejected": -246.64968872070312, "logps/rejected": -322.8077392578125, "loss": 0.5479, "rewards/accuracies": 0.875, "rewards/chosen": -0.8804708123207092, "rewards/grad_term": 0.01271949615329504, "rewards/margins": 6.735333442687988, "rewards/rejected": -7.615804195404053, "step": 370 }, { "epoch": 0.7696097497731104, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.125, "grad_norm": 63.7741156365506, "learning_rate": 6.851211072664359e-07, "logits/chosen": -0.01846727915108204, "logits/rejected": -0.008946547284722328, "logps/accuracies": 0.875, "logps/chosen": -305.61639404296875, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -300.2716064453125, "logps/ref_rejected": -331.55859375, "logps/rejected": -386.4937438964844, "loss": 0.5752, "rewards/accuracies": 0.75, "rewards/chosen": -0.5344789624214172, "rewards/grad_term": 0.02246464043855667, "rewards/margins": 4.959036350250244, "rewards/rejected": -5.4935150146484375, "step": 371 }, { "epoch": 0.7716841695838196, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 64.45685095510555, "learning_rate": 6.839677047289504e-07, "logits/chosen": 0.08011619746685028, "logits/rejected": 0.09146730601787567, "logps/accuracies": 0.8125, "logps/chosen": -345.562255859375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -346.65069580078125, "logps/ref_rejected": -352.12396240234375, "logps/rejected": -403.227294921875, "loss": 0.572, "rewards/accuracies": 1.0, "rewards/chosen": 0.10884669423103333, "rewards/grad_term": 0.013191865757107735, "rewards/margins": 5.219181537628174, "rewards/rejected": -5.110335350036621, "step": 372 }, { "epoch": 0.7737585893945287, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 39.59306558130555, "learning_rate": 6.828143021914648e-07, "logits/chosen": -0.12652695178985596, "logits/rejected": -0.07933872938156128, "logps/accuracies": 0.8125, "logps/chosen": -299.42108154296875, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -306.9295654296875, "logps/ref_rejected": -296.8434143066406, "logps/rejected": -359.88623046875, "loss": 0.5548, "rewards/accuracies": 1.0, "rewards/chosen": 0.7508491277694702, "rewards/grad_term": 0.005514204967767, "rewards/margins": 7.0551300048828125, "rewards/rejected": -6.304280757904053, "step": 373 }, { "epoch": 0.7758330092052379, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 76.23798525574978, "learning_rate": 6.816608996539792e-07, "logits/chosen": 0.12230158597230911, "logits/rejected": 0.12023597955703735, "logps/accuracies": 0.75, "logps/chosen": -281.0087890625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -282.221923828125, "logps/ref_rejected": -290.686767578125, "logps/rejected": -355.41357421875, "loss": 0.603, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1213146299123764, "rewards/grad_term": 0.011165942065417767, "rewards/margins": 6.593995571136475, "rewards/rejected": -6.472680568695068, "step": 374 }, { "epoch": 0.7779074290159471, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.125, "grad_norm": 34.99314645928879, "learning_rate": 6.805074971164936e-07, "logits/chosen": 0.15410278737545013, "logits/rejected": 0.2643253803253174, "logps/accuracies": 0.875, "logps/chosen": -305.5322265625, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -313.14080810546875, "logps/ref_rejected": -341.4920654296875, "logps/rejected": -399.58892822265625, "loss": 0.6245, "rewards/accuracies": 1.0, "rewards/chosen": 0.7608582377433777, "rewards/grad_term": 0.008731910958886147, "rewards/margins": 6.570548057556152, "rewards/rejected": -5.809689998626709, "step": 375 }, { "epoch": 0.7799818488266563, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.3125, "grad_norm": 54.1532317962664, "learning_rate": 6.79354094579008e-07, "logits/chosen": 0.3877769708633423, "logits/rejected": 0.35827726125717163, "logps/accuracies": 0.6875, "logps/chosen": -262.630126953125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -267.2193298339844, "logps/ref_rejected": -227.17079162597656, "logps/rejected": -268.9840087890625, "loss": 0.6305, "rewards/accuracies": 1.0, "rewards/chosen": 0.45892155170440674, "rewards/grad_term": 0.014158925041556358, "rewards/margins": 4.640246391296387, "rewards/rejected": -4.1813249588012695, "step": 376 }, { "epoch": 0.7820562686373654, "flips/correct->correct": 0.1875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.3125, "grad_norm": 29.1706749528385, "learning_rate": 6.782006920415224e-07, "logits/chosen": 0.33914873003959656, "logits/rejected": 0.2858618199825287, "logps/accuracies": 0.6875, "logps/chosen": -333.1984558105469, "logps/ref_accuracies": 0.1875, "logps/ref_chosen": -334.16839599609375, "logps/ref_rejected": -299.3583679199219, "logps/rejected": -365.6182556152344, "loss": 0.6119, "rewards/accuracies": 0.875, "rewards/chosen": 0.0969928503036499, "rewards/grad_term": 0.011358851566910744, "rewards/margins": 6.722982883453369, "rewards/rejected": -6.62598991394043, "step": 377 }, { "epoch": 0.7841306884480747, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 84.54685393998759, "learning_rate": 6.770472895040369e-07, "logits/chosen": 0.24667781591415405, "logits/rejected": 0.2804810106754303, "logps/accuracies": 0.6875, "logps/chosen": -298.77734375, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -290.1251220703125, "logps/ref_rejected": -294.4027099609375, "logps/rejected": -353.4877014160156, "loss": 0.5833, "rewards/accuracies": 0.875, "rewards/chosen": -0.8652223348617554, "rewards/grad_term": 0.013081303797662258, "rewards/margins": 5.043279647827148, "rewards/rejected": -5.908501625061035, "step": 378 }, { "epoch": 0.7862051082587839, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 70.28746618838973, "learning_rate": 6.758938869665512e-07, "logits/chosen": 0.13181552290916443, "logits/rejected": 0.20094197988510132, "logps/accuracies": 0.875, "logps/chosen": -299.0951843261719, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -298.9969177246094, "logps/ref_rejected": -297.56201171875, "logps/rejected": -363.7420349121094, "loss": 0.5688, "rewards/accuracies": 0.9375, "rewards/chosen": -0.009829364717006683, "rewards/grad_term": 0.010300719179213047, "rewards/margins": 6.60817289352417, "rewards/rejected": -6.618002414703369, "step": 379 }, { "epoch": 0.7882795280694931, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 36.661344739750994, "learning_rate": 6.747404844290657e-07, "logits/chosen": 0.11381202936172485, "logits/rejected": 0.27916306257247925, "logps/accuracies": 0.75, "logps/chosen": -311.8406982421875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -300.5757141113281, "logps/ref_rejected": -303.3941345214844, "logps/rejected": -374.7430419921875, "loss": 0.5351, "rewards/accuracies": 0.875, "rewards/chosen": -1.1265006065368652, "rewards/grad_term": 0.013736705295741558, "rewards/margins": 6.008389949798584, "rewards/rejected": -7.134890556335449, "step": 380 }, { "epoch": 0.7903539478802023, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 52.40416785212294, "learning_rate": 6.735870818915801e-07, "logits/chosen": 0.28354066610336304, "logits/rejected": 0.3793669044971466, "logps/accuracies": 0.9375, "logps/chosen": -251.4017333984375, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -241.32391357421875, "logps/ref_rejected": -268.19512939453125, "logps/rejected": -346.08892822265625, "loss": 0.5578, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0077815055847168, "rewards/grad_term": 0.0052658445201814175, "rewards/margins": 6.781601905822754, "rewards/rejected": -7.789383411407471, "step": 381 }, { "epoch": 0.7924283676909114, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 30.49757789797264, "learning_rate": 6.724336793540946e-07, "logits/chosen": 0.4262790381908417, "logits/rejected": 0.44936031103134155, "logps/accuracies": 0.875, "logps/chosen": -276.0723571777344, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -259.67919921875, "logps/ref_rejected": -269.11407470703125, "logps/rejected": -344.3211364746094, "loss": 0.5833, "rewards/accuracies": 0.9375, "rewards/chosen": -1.639316201210022, "rewards/grad_term": 0.017718428745865822, "rewards/margins": 5.881390571594238, "rewards/rejected": -7.520707130432129, "step": 382 }, { "epoch": 0.7945027875016206, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 32.850204291988575, "learning_rate": 6.71280276816609e-07, "logits/chosen": 0.41091158986091614, "logits/rejected": 0.46820542216300964, "logps/accuracies": 0.75, "logps/chosen": -324.7238464355469, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -312.0603942871094, "logps/ref_rejected": -325.2768859863281, "logps/rejected": -401.4980773925781, "loss": 0.6727, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2663447856903076, "rewards/grad_term": 0.01295191328972578, "rewards/margins": 6.35577392578125, "rewards/rejected": -7.622118949890137, "step": 383 }, { "epoch": 0.7965772073123298, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.4375, "grad_norm": 40.68382714512231, "learning_rate": 6.701268742791234e-07, "logits/chosen": -0.04504679515957832, "logits/rejected": -0.05939174070954323, "logps/accuracies": 0.5, "logps/chosen": -364.9970703125, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -330.1788635253906, "logps/ref_rejected": -310.0872497558594, "logps/rejected": -393.5850830078125, "loss": 0.6752, "rewards/accuracies": 0.9375, "rewards/chosen": -3.481823444366455, "rewards/grad_term": 0.014739114791154861, "rewards/margins": 4.8679633140563965, "rewards/rejected": -8.349786758422852, "step": 384 }, { "epoch": 0.7965772073123298, "eval_flips/correct->correct": 0.4433497488498688, "eval_flips/correct->incorrect": 0.0, "eval_flips/incorrect->correct": 0.3497537076473236, "eval_flips/incorrect->incorrect": 0.2068965584039688, "eval_logits/chosen": 0.1350509524345398, "eval_logits/rejected": 0.17706024646759033, "eval_logps/accuracies": 0.7931034564971924, "eval_logps/chosen": -310.3870544433594, "eval_logps/ref_accuracies": 0.4433497488498688, "eval_logps/ref_chosen": -287.3511047363281, "eval_logps/ref_rejected": -289.0460205078125, "eval_logps/rejected": -369.9229431152344, "eval_loss": 0.6723487973213196, "eval_rewards/accuracies": 0.9261083602905273, "eval_rewards/chosen": -2.3035953044891357, "eval_rewards/grad_term": 0.011555198580026627, "eval_rewards/margins": 5.784095287322998, "eval_rewards/rejected": -8.087691307067871, "eval_runtime": 804.6111, "eval_samples_per_second": 2.011, "eval_steps_per_second": 0.252, "step": 384 }, { "epoch": 0.798651627123039, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 50.762610162394786, "learning_rate": 6.689734717416378e-07, "logits/chosen": -0.012154202908277512, "logits/rejected": 0.0032455138862133026, "logps/accuracies": 0.8125, "logps/chosen": -352.6640930175781, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -328.68597412109375, "logps/ref_rejected": -300.6054992675781, "logps/rejected": -397.1305847167969, "loss": 0.6399, "rewards/accuracies": 1.0, "rewards/chosen": -2.3978145122528076, "rewards/grad_term": 0.0043524750508368015, "rewards/margins": 7.254694938659668, "rewards/rejected": -9.652509689331055, "step": 385 }, { "epoch": 0.8007260469337483, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 95.05673688341686, "learning_rate": 6.678200692041522e-07, "logits/chosen": 0.21177135407924652, "logits/rejected": 0.23154297471046448, "logps/accuracies": 0.875, "logps/chosen": -336.6234130859375, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -302.63336181640625, "logps/ref_rejected": -325.64202880859375, "logps/rejected": -405.45782470703125, "loss": 0.6303, "rewards/accuracies": 0.9375, "rewards/chosen": -3.399005174636841, "rewards/grad_term": 0.015561016276478767, "rewards/margins": 4.582573413848877, "rewards/rejected": -7.981578826904297, "step": 386 }, { "epoch": 0.8028004667444574, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.4375, "grad_norm": 98.7595748405997, "learning_rate": 6.666666666666666e-07, "logits/chosen": -0.1825534999370575, "logits/rejected": -0.13728323578834534, "logps/accuracies": 0.5625, "logps/chosen": -279.4841613769531, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -266.027099609375, "logps/ref_rejected": -238.2875518798828, "logps/rejected": -310.189453125, "loss": 0.6502, "rewards/accuracies": 0.875, "rewards/chosen": -1.345707893371582, "rewards/grad_term": 0.013503390364348888, "rewards/margins": 5.844482421875, "rewards/rejected": -7.19019079208374, "step": 387 }, { "epoch": 0.8048748865551666, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0625, "grad_norm": 58.11377216377975, "learning_rate": 6.655132641291811e-07, "logits/chosen": 0.20901203155517578, "logits/rejected": 0.19806969165802002, "logps/accuracies": 0.9375, "logps/chosen": -327.6300354003906, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -298.32843017578125, "logps/ref_rejected": -294.7974853515625, "logps/rejected": -393.97967529296875, "loss": 0.6463, "rewards/accuracies": 1.0, "rewards/chosen": -2.9301586151123047, "rewards/grad_term": 0.0037229093722999096, "rewards/margins": 6.988059997558594, "rewards/rejected": -9.918218612670898, "step": 388 }, { "epoch": 0.8069493063658758, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 23.205827934491982, "learning_rate": 6.643598615916954e-07, "logits/chosen": 0.18306072056293488, "logits/rejected": 0.23532596230506897, "logps/accuracies": 0.875, "logps/chosen": -248.78665161132812, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -254.37738037109375, "logps/ref_rejected": -251.6569061279297, "logps/rejected": -319.8973388671875, "loss": 0.5127, "rewards/accuracies": 1.0, "rewards/chosen": 0.5590727925300598, "rewards/grad_term": 0.001746954396367073, "rewards/margins": 7.383120536804199, "rewards/rejected": -6.824047088623047, "step": 389 }, { "epoch": 0.809023726176585, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 46.61067138619315, "learning_rate": 6.632064590542099e-07, "logits/chosen": 0.16985514760017395, "logits/rejected": 0.16642533242702484, "logps/accuracies": 0.9375, "logps/chosen": -340.10723876953125, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -344.4974670410156, "logps/ref_rejected": -371.35791015625, "logps/rejected": -436.1505432128906, "loss": 0.5613, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4390239119529724, "rewards/grad_term": 0.007462616544216871, "rewards/margins": 6.918284893035889, "rewards/rejected": -6.47926139831543, "step": 390 }, { "epoch": 0.8110981459872941, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 57.80966589693944, "learning_rate": 6.620530565167242e-07, "logits/chosen": -0.10273560136556625, "logits/rejected": -0.0613471083343029, "logps/accuracies": 0.75, "logps/chosen": -216.7875518798828, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -226.03538513183594, "logps/ref_rejected": -221.71621704101562, "logps/rejected": -261.552978515625, "loss": 0.5161, "rewards/accuracies": 0.875, "rewards/chosen": 0.9247859120368958, "rewards/grad_term": 0.0170612595975399, "rewards/margins": 4.9084649085998535, "rewards/rejected": -3.9836790561676025, "step": 391 }, { "epoch": 0.8131725657980033, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.375, "grad_norm": 62.6241872293078, "learning_rate": 6.608996539792387e-07, "logits/chosen": 0.22374431788921356, "logits/rejected": 0.22435928881168365, "logps/accuracies": 0.625, "logps/chosen": -285.15252685546875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -289.751220703125, "logps/ref_rejected": -287.3389587402344, "logps/rejected": -332.70794677734375, "loss": 0.5729, "rewards/accuracies": 1.0, "rewards/chosen": 0.4598711133003235, "rewards/grad_term": 0.01684059388935566, "rewards/margins": 4.996764659881592, "rewards/rejected": -4.536893844604492, "step": 392 }, { "epoch": 0.8152469856087126, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 104.53799642819773, "learning_rate": 6.597462514417531e-07, "logits/chosen": 0.1355782002210617, "logits/rejected": 0.15115031599998474, "logps/accuracies": 0.6875, "logps/chosen": -267.6117858886719, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -278.65167236328125, "logps/ref_rejected": -268.8196105957031, "logps/rejected": -310.05419921875, "loss": 0.655, "rewards/accuracies": 0.875, "rewards/chosen": 1.1039892435073853, "rewards/grad_term": 0.0200988557189703, "rewards/margins": 5.227451801300049, "rewards/rejected": -4.123462677001953, "step": 393 }, { "epoch": 0.8173214054194218, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0, "flips/incorrect->incorrect": 0.5625, "grad_norm": 91.5589589431653, "learning_rate": 6.585928489042676e-07, "logits/chosen": 0.05219127982854843, "logits/rejected": 0.1293550282716751, "logps/accuracies": 0.4375, "logps/chosen": -287.01702880859375, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -297.4679870605469, "logps/ref_rejected": -308.44219970703125, "logps/rejected": -331.50775146484375, "loss": 0.6269, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0450924634933472, "rewards/grad_term": 0.0215632114559412, "rewards/margins": 3.3516530990600586, "rewards/rejected": -2.306560516357422, "step": 394 }, { "epoch": 0.819395825230131, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 80.60637705363116, "learning_rate": 6.57439446366782e-07, "logits/chosen": -0.04064434394240379, "logits/rejected": -0.011088773608207703, "logps/accuracies": 0.75, "logps/chosen": -245.4797821044922, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -250.2657928466797, "logps/ref_rejected": -278.16424560546875, "logps/rejected": -307.46978759765625, "loss": 0.6575, "rewards/accuracies": 0.875, "rewards/chosen": 0.4786025285720825, "rewards/grad_term": 0.023240692913532257, "rewards/margins": 3.4091572761535645, "rewards/rejected": -2.9305543899536133, "step": 395 }, { "epoch": 0.8214702450408401, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.375, "grad_norm": 37.32684776835478, "learning_rate": 6.562860438292964e-07, "logits/chosen": 0.10297183692455292, "logits/rejected": 0.11840492486953735, "logps/accuracies": 0.625, "logps/chosen": -298.5022888183594, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -309.21044921875, "logps/ref_rejected": -305.84539794921875, "logps/rejected": -328.775390625, "loss": 0.6662, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0708208084106445, "rewards/grad_term": 0.022083457559347153, "rewards/margins": 3.363819122314453, "rewards/rejected": -2.2929983139038086, "step": 396 }, { "epoch": 0.8235446648515493, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.0625, "grad_norm": 23.575297499080293, "learning_rate": 6.551326412918108e-07, "logits/chosen": 0.11109241843223572, "logits/rejected": 0.11409325897693634, "logps/accuracies": 0.9375, "logps/chosen": -276.38092041015625, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -285.5079345703125, "logps/ref_rejected": -288.4066162109375, "logps/rejected": -335.4505920410156, "loss": 0.5911, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9127010107040405, "rewards/grad_term": 0.014403178356587887, "rewards/margins": 5.617100715637207, "rewards/rejected": -4.704399585723877, "step": 397 }, { "epoch": 0.8256190846622585, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 46.15593222404285, "learning_rate": 6.539792387543253e-07, "logits/chosen": 0.09000806510448456, "logits/rejected": 0.09876266866922379, "logps/accuracies": 0.75, "logps/chosen": -270.1310119628906, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -274.2320556640625, "logps/ref_rejected": -265.1705322265625, "logps/rejected": -315.0960693359375, "loss": 0.5498, "rewards/accuracies": 1.0, "rewards/chosen": 0.41010797023773193, "rewards/grad_term": 0.010970378294587135, "rewards/margins": 5.402661323547363, "rewards/rejected": -4.992552757263184, "step": 398 }, { "epoch": 0.8276935044729677, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.125, "grad_norm": 44.74586930025993, "learning_rate": 6.528258362168396e-07, "logits/chosen": 0.26735085248947144, "logits/rejected": 0.30994755029678345, "logps/accuracies": 0.875, "logps/chosen": -246.11720275878906, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -253.5211181640625, "logps/ref_rejected": -260.9595031738281, "logps/rejected": -306.35498046875, "loss": 0.5491, "rewards/accuracies": 1.0, "rewards/chosen": 0.7403918504714966, "rewards/grad_term": 0.01072466466575861, "rewards/margins": 5.279941558837891, "rewards/rejected": -4.539549827575684, "step": 399 }, { "epoch": 0.829767924283677, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.1875, "grad_norm": 52.75963999398655, "learning_rate": 6.516724336793541e-07, "logits/chosen": 0.40581169724464417, "logits/rejected": 0.43723931908607483, "logps/accuracies": 0.8125, "logps/chosen": -302.2900085449219, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -299.8786926269531, "logps/ref_rejected": -311.3181457519531, "logps/rejected": -364.59326171875, "loss": 0.5293, "rewards/accuracies": 0.875, "rewards/chosen": -0.24113261699676514, "rewards/grad_term": 0.017243320122361183, "rewards/margins": 5.086377143859863, "rewards/rejected": -5.32750940322876, "step": 400 }, { "epoch": 0.8318423440943861, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 75.29899473684678, "learning_rate": 6.505190311418684e-07, "logits/chosen": -0.09569695591926575, "logits/rejected": -0.0767926424741745, "logps/accuracies": 0.75, "logps/chosen": -301.2300109863281, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -292.5848388671875, "logps/ref_rejected": -290.6342468261719, "logps/rejected": -361.4559326171875, "loss": 0.5411, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8645120859146118, "rewards/grad_term": 0.014016557484865189, "rewards/margins": 6.21765661239624, "rewards/rejected": -7.0821685791015625, "step": 401 }, { "epoch": 0.8339167639050953, "flips/correct->correct": 0.1875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.625, "flips/incorrect->incorrect": 0.1875, "grad_norm": 16.000137065905, "learning_rate": 6.493656286043829e-07, "logits/chosen": 0.13318368792533875, "logits/rejected": 0.1401294469833374, "logps/accuracies": 0.8125, "logps/chosen": -306.74395751953125, "logps/ref_accuracies": 0.1875, "logps/ref_chosen": -315.7019958496094, "logps/ref_rejected": -279.975341796875, "logps/rejected": -362.96795654296875, "loss": 0.5658, "rewards/accuracies": 1.0, "rewards/chosen": 0.8958021998405457, "rewards/grad_term": 0.0026785405352711678, "rewards/margins": 9.195062637329102, "rewards/rejected": -8.299260139465332, "step": 402 }, { "epoch": 0.8359911837158045, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 39.13628890660203, "learning_rate": 6.482122260668973e-07, "logits/chosen": 0.4807916283607483, "logits/rejected": 0.6536089181900024, "logps/accuracies": 0.8125, "logps/chosen": -300.46356201171875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -272.149658203125, "logps/ref_rejected": -331.09649658203125, "logps/rejected": -420.2705078125, "loss": 0.5903, "rewards/accuracies": 0.9375, "rewards/chosen": -2.831390142440796, "rewards/grad_term": 0.010690869763493538, "rewards/margins": 6.086010932922363, "rewards/rejected": -8.917401313781738, "step": 403 }, { "epoch": 0.8380656035265137, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 30.654540085811046, "learning_rate": 6.470588235294117e-07, "logits/chosen": 0.22139021754264832, "logits/rejected": 0.2569182515144348, "logps/accuracies": 0.75, "logps/chosen": -349.45458984375, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -313.68267822265625, "logps/ref_rejected": -315.2803955078125, "logps/rejected": -405.5434265136719, "loss": 0.6411, "rewards/accuracies": 1.0, "rewards/chosen": -3.5771892070770264, "rewards/grad_term": 0.005928752478212118, "rewards/margins": 5.449113368988037, "rewards/rejected": -9.0263032913208, "step": 404 }, { "epoch": 0.8401400233372228, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 45.0761906012067, "learning_rate": 6.459054209919261e-07, "logits/chosen": 0.35096606612205505, "logits/rejected": 0.44806602597236633, "logps/accuracies": 0.8125, "logps/chosen": -223.75851440429688, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -213.4886474609375, "logps/ref_rejected": -226.67689514160156, "logps/rejected": -300.3026428222656, "loss": 0.7055, "rewards/accuracies": 1.0, "rewards/chosen": -1.026986002922058, "rewards/grad_term": 0.006491546984761953, "rewards/margins": 6.3355865478515625, "rewards/rejected": -7.36257266998291, "step": 405 }, { "epoch": 0.842214443147932, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 26.11429132364704, "learning_rate": 6.447520184544407e-07, "logits/chosen": 0.051110029220581055, "logits/rejected": 0.10619683563709259, "logps/accuracies": 0.875, "logps/chosen": -313.3158874511719, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -292.2518310546875, "logps/ref_rejected": -291.42193603515625, "logps/rejected": -382.94781494140625, "loss": 0.7203, "rewards/accuracies": 1.0, "rewards/chosen": -2.1064045429229736, "rewards/grad_term": 0.006023161578923464, "rewards/margins": 7.046186447143555, "rewards/rejected": -9.15259075164795, "step": 406 }, { "epoch": 0.8442888629586413, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 49.70258596298242, "learning_rate": 6.43598615916955e-07, "logits/chosen": 0.2751619219779968, "logits/rejected": 0.2619101107120514, "logps/accuracies": 0.875, "logps/chosen": -298.23443603515625, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -270.52728271484375, "logps/ref_rejected": -262.9530029296875, "logps/rejected": -359.8721618652344, "loss": 0.6965, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7707149982452393, "rewards/grad_term": 0.008775541558861732, "rewards/margins": 6.921198844909668, "rewards/rejected": -9.691913604736328, "step": 407 }, { "epoch": 0.8463632827693505, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.375, "grad_norm": 76.86823523264724, "learning_rate": 6.424452133794695e-07, "logits/chosen": 0.10307708382606506, "logits/rejected": 0.0942949503660202, "logps/accuracies": 0.625, "logps/chosen": -341.54669189453125, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -313.2756652832031, "logps/ref_rejected": -308.5162658691406, "logps/rejected": -385.50421142578125, "loss": 0.7027, "rewards/accuracies": 0.9375, "rewards/chosen": -2.827104330062866, "rewards/grad_term": 0.01655164361000061, "rewards/margins": 4.871689796447754, "rewards/rejected": -7.698794364929199, "step": 408 }, { "epoch": 0.8484377025800597, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 59.950272206470274, "learning_rate": 6.412918108419838e-07, "logits/chosen": 0.019011177122592926, "logits/rejected": 0.09038500487804413, "logps/accuracies": 0.9375, "logps/chosen": -308.01776123046875, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -263.8279113769531, "logps/ref_rejected": -277.70489501953125, "logps/rejected": -373.0634765625, "loss": 0.6458, "rewards/accuracies": 0.9375, "rewards/chosen": -4.418985366821289, "rewards/grad_term": 0.01191724929958582, "rewards/margins": 5.116873741149902, "rewards/rejected": -9.535858154296875, "step": 409 }, { "epoch": 0.8505121223907688, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0625, "grad_norm": 60.54011532087722, "learning_rate": 6.401384083044983e-07, "logits/chosen": 0.105913445353508, "logits/rejected": 0.05668123438954353, "logps/accuracies": 0.9375, "logps/chosen": -325.8265380859375, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -328.54022216796875, "logps/ref_rejected": -309.3156433105469, "logps/rejected": -414.94476318359375, "loss": 0.5845, "rewards/accuracies": 1.0, "rewards/chosen": 0.27136293053627014, "rewards/grad_term": 6.003907765261829e-05, "rewards/margins": 10.834280014038086, "rewards/rejected": -10.56291675567627, "step": 410 }, { "epoch": 0.852586542201478, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 47.192528321568695, "learning_rate": 6.389850057670127e-07, "logits/chosen": 0.24323594570159912, "logits/rejected": 0.2931632995605469, "logps/accuracies": 0.75, "logps/chosen": -285.09918212890625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -271.9576721191406, "logps/ref_rejected": -269.5434875488281, "logps/rejected": -350.0928955078125, "loss": 0.6445, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3141498565673828, "rewards/grad_term": 0.012051810510456562, "rewards/margins": 6.740789413452148, "rewards/rejected": -8.054939270019531, "step": 411 }, { "epoch": 0.8546609620121872, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0625, "grad_norm": 58.581221597188936, "learning_rate": 6.378316032295271e-07, "logits/chosen": -0.11936801671981812, "logits/rejected": -0.12492658197879791, "logps/accuracies": 0.9375, "logps/chosen": -338.5908508300781, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -339.50506591796875, "logps/ref_rejected": -328.51324462890625, "logps/rejected": -415.5992126464844, "loss": 0.5409, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0914229154586792, "rewards/grad_term": 0.006874611601233482, "rewards/margins": 8.800016403198242, "rewards/rejected": -8.708593368530273, "step": 412 }, { "epoch": 0.8567353818228964, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 31.837832566947164, "learning_rate": 6.366782006920415e-07, "logits/chosen": 0.43914633989334106, "logits/rejected": 0.5609852075576782, "logps/accuracies": 0.875, "logps/chosen": -258.90252685546875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -257.27178955078125, "logps/ref_rejected": -292.46502685546875, "logps/rejected": -361.5235900878906, "loss": 0.5556, "rewards/accuracies": 0.875, "rewards/chosen": -0.16307339072227478, "rewards/grad_term": 0.015529593452811241, "rewards/margins": 6.742788791656494, "rewards/rejected": -6.905861854553223, "step": 413 }, { "epoch": 0.8588098016336057, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 130.10385493391817, "learning_rate": 6.355247981545559e-07, "logits/chosen": 0.33855926990509033, "logits/rejected": 0.37952950596809387, "logps/accuracies": 0.875, "logps/chosen": -364.90386962890625, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -364.53131103515625, "logps/ref_rejected": -361.88043212890625, "logps/rejected": -423.7973937988281, "loss": 0.5026, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03725364804267883, "rewards/grad_term": 0.014217305928468704, "rewards/margins": 6.1544389724731445, "rewards/rejected": -6.191693305969238, "step": 414 }, { "epoch": 0.8608842214443148, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.0625, "flips/incorrect->incorrect": 0.3125, "grad_norm": 72.11770229856765, "learning_rate": 6.343713956170703e-07, "logits/chosen": 0.40989670157432556, "logits/rejected": 0.4901154935359955, "logps/accuracies": 0.6875, "logps/chosen": -218.7454833984375, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -221.65101623535156, "logps/ref_rejected": -227.98141479492188, "logps/rejected": -275.6513366699219, "loss": 0.5736, "rewards/accuracies": 1.0, "rewards/chosen": 0.29055318236351013, "rewards/grad_term": 0.013483730144798756, "rewards/margins": 5.057545185089111, "rewards/rejected": -4.766992092132568, "step": 415 }, { "epoch": 0.862958641255024, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.3125, "grad_norm": 51.89358785836724, "learning_rate": 6.332179930795848e-07, "logits/chosen": -0.05566471815109253, "logits/rejected": 0.030616842210292816, "logps/accuracies": 0.6875, "logps/chosen": -316.4223327636719, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -327.1333312988281, "logps/ref_rejected": -320.0599670410156, "logps/rejected": -366.65240478515625, "loss": 0.6136, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0711019039154053, "rewards/grad_term": 0.016874371096491814, "rewards/margins": 5.730344295501709, "rewards/rejected": -4.659242153167725, "step": 416 }, { "epoch": 0.8650330610657332, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 38.31866291679492, "learning_rate": 6.320645905420991e-07, "logits/chosen": 0.1870647817850113, "logits/rejected": 0.18267706036567688, "logps/accuracies": 0.75, "logps/chosen": -360.9079895019531, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -376.5132751464844, "logps/ref_rejected": -365.2701721191406, "logps/rejected": -400.93572998046875, "loss": 0.591, "rewards/accuracies": 1.0, "rewards/chosen": 1.56052565574646, "rewards/grad_term": 0.011596291325986385, "rewards/margins": 5.127077579498291, "rewards/rejected": -3.5665524005889893, "step": 417 }, { "epoch": 0.8671074808764424, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 92.98916391894235, "learning_rate": 6.309111880046136e-07, "logits/chosen": 0.03672199696302414, "logits/rejected": 0.061092860996723175, "logps/accuracies": 0.8125, "logps/chosen": -270.1746520996094, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -281.4916076660156, "logps/ref_rejected": -294.692626953125, "logps/rejected": -329.16668701171875, "loss": 0.6474, "rewards/accuracies": 1.0, "rewards/chosen": 1.1316967010498047, "rewards/grad_term": 0.014774792827665806, "rewards/margins": 4.579105377197266, "rewards/rejected": -3.447408437728882, "step": 418 }, { "epoch": 0.8691819006871515, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 23.063566639192924, "learning_rate": 6.29757785467128e-07, "logits/chosen": 0.3395993113517761, "logits/rejected": 0.38536539673805237, "logps/accuracies": 0.8125, "logps/chosen": -286.3059997558594, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -296.46038818359375, "logps/ref_rejected": -286.7039489746094, "logps/rejected": -349.8249206542969, "loss": 0.6349, "rewards/accuracies": 0.875, "rewards/chosen": 1.0154372453689575, "rewards/grad_term": 0.009252113290131092, "rewards/margins": 7.3275322914123535, "rewards/rejected": -6.312095642089844, "step": 419 }, { "epoch": 0.8712563204978607, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 44.948117340095266, "learning_rate": 6.286043829296425e-07, "logits/chosen": -0.0751362144947052, "logits/rejected": -0.003658019006252289, "logps/accuracies": 0.75, "logps/chosen": -268.517333984375, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -265.34674072265625, "logps/ref_rejected": -274.8442077636719, "logps/rejected": -334.3121643066406, "loss": 0.6457, "rewards/accuracies": 0.875, "rewards/chosen": -0.3170568645000458, "rewards/grad_term": 0.017749693244695663, "rewards/margins": 5.629739284515381, "rewards/rejected": -5.94679594039917, "step": 420 }, { "epoch": 0.8733307403085699, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 34.9886652815262, "learning_rate": 6.274509803921569e-07, "logits/chosen": 0.3326599597930908, "logits/rejected": 0.3828299343585968, "logps/accuracies": 0.875, "logps/chosen": -316.2264404296875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -327.22039794921875, "logps/ref_rejected": -343.5686950683594, "logps/rejected": -394.07080078125, "loss": 0.5868, "rewards/accuracies": 0.875, "rewards/chosen": 1.0993949174880981, "rewards/grad_term": 0.012708110734820366, "rewards/margins": 6.149601936340332, "rewards/rejected": -5.050206661224365, "step": 421 }, { "epoch": 0.8754051601192792, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0625, "grad_norm": 17.44095726007292, "learning_rate": 6.262975778546713e-07, "logits/chosen": 0.06078142672777176, "logits/rejected": -0.015550296753644943, "logps/accuracies": 0.9375, "logps/chosen": -322.6217346191406, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -336.0628356933594, "logps/ref_rejected": -319.92767333984375, "logps/rejected": -387.3755798339844, "loss": 0.5532, "rewards/accuracies": 1.0, "rewards/chosen": 1.344112515449524, "rewards/grad_term": 0.007527807727456093, "rewards/margins": 8.088907241821289, "rewards/rejected": -6.7447943687438965, "step": 422 }, { "epoch": 0.8774795799299884, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.625, "flips/incorrect->incorrect": 0.0, "grad_norm": 65.08075817623737, "learning_rate": 6.251441753171857e-07, "logits/chosen": 0.06982388347387314, "logits/rejected": 0.007165290415287018, "logps/accuracies": 1.0, "logps/chosen": -300.95513916015625, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -306.9526062011719, "logps/ref_rejected": -288.5630798339844, "logps/rejected": -370.71649169921875, "loss": 0.5573, "rewards/accuracies": 0.875, "rewards/chosen": 0.5997495651245117, "rewards/grad_term": 0.006430043373256922, "rewards/margins": 8.815089225769043, "rewards/rejected": -8.215339660644531, "step": 423 }, { "epoch": 0.8795539997406975, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.25, "grad_norm": 24.805163857445233, "learning_rate": 6.239907727797001e-07, "logits/chosen": 0.22340461611747742, "logits/rejected": 0.21236705780029297, "logps/accuracies": 0.75, "logps/chosen": -313.9115295410156, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -315.0428161621094, "logps/ref_rejected": -296.20806884765625, "logps/rejected": -360.9374694824219, "loss": 0.5634, "rewards/accuracies": 1.0, "rewards/chosen": 0.11312799155712128, "rewards/grad_term": 0.00796779990196228, "rewards/margins": 6.586068153381348, "rewards/rejected": -6.472940444946289, "step": 424 }, { "epoch": 0.8816284195514067, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 76.72911496113723, "learning_rate": 6.228373702422145e-07, "logits/chosen": 0.14682908356189728, "logits/rejected": 0.15023761987686157, "logps/accuracies": 0.75, "logps/chosen": -277.3023681640625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -261.1825256347656, "logps/ref_rejected": -256.98846435546875, "logps/rejected": -323.0633239746094, "loss": 0.5797, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6119821071624756, "rewards/grad_term": 0.017880305647850037, "rewards/margins": 4.995503902435303, "rewards/rejected": -6.607484817504883, "step": 425 }, { "epoch": 0.8837028393621159, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 59.32877504186551, "learning_rate": 6.21683967704729e-07, "logits/chosen": 0.11907504498958588, "logits/rejected": 0.10917734354734421, "logps/accuracies": 0.8125, "logps/chosen": -280.1270446777344, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -282.9510192871094, "logps/ref_rejected": -269.74761962890625, "logps/rejected": -354.8128662109375, "loss": 0.535, "rewards/accuracies": 0.875, "rewards/chosen": 0.28239575028419495, "rewards/grad_term": 0.011113264597952366, "rewards/margins": 8.788921356201172, "rewards/rejected": -8.506525993347168, "step": 426 }, { "epoch": 0.8857772591728251, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0625, "grad_norm": 127.10327511506759, "learning_rate": 6.205305651672433e-07, "logits/chosen": 0.17048220336437225, "logits/rejected": 0.16041475534439087, "logps/accuracies": 0.9375, "logps/chosen": -294.7939758300781, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -294.1711730957031, "logps/ref_rejected": -291.6072082519531, "logps/rejected": -370.58660888671875, "loss": 0.5691, "rewards/accuracies": 1.0, "rewards/chosen": -0.062279731035232544, "rewards/grad_term": 0.0018367553129792213, "rewards/margins": 7.835660934448242, "rewards/rejected": -7.8979411125183105, "step": 427 }, { "epoch": 0.8878516789835343, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 54.660911786291244, "learning_rate": 6.193771626297578e-07, "logits/chosen": 0.036782991141080856, "logits/rejected": 0.06632021814584732, "logps/accuracies": 0.8125, "logps/chosen": -297.9986572265625, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -289.60797119140625, "logps/ref_rejected": -254.0615234375, "logps/rejected": -336.6078796386719, "loss": 0.5262, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8390707969665527, "rewards/grad_term": 0.00769506860524416, "rewards/margins": 7.415563583374023, "rewards/rejected": -8.254634857177734, "step": 428 }, { "epoch": 0.8899260987942434, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0625, "grad_norm": 50.959026978469176, "learning_rate": 6.182237600922721e-07, "logits/chosen": 0.2548729181289673, "logits/rejected": 0.24063560366630554, "logps/accuracies": 0.9375, "logps/chosen": -358.7782287597656, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -354.93792724609375, "logps/ref_rejected": -339.633544921875, "logps/rejected": -428.53302001953125, "loss": 0.5172, "rewards/accuracies": 1.0, "rewards/chosen": -0.3840335011482239, "rewards/grad_term": 0.003113335929811001, "rewards/margins": 8.505916595458984, "rewards/rejected": -8.8899507522583, "step": 429 }, { "epoch": 0.8920005186049527, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 29.304522529635086, "learning_rate": 6.170703575547866e-07, "logits/chosen": 0.11386538296937943, "logits/rejected": 0.1519451141357422, "logps/accuracies": 0.8125, "logps/chosen": -247.40721130371094, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -234.81825256347656, "logps/ref_rejected": -226.5369110107422, "logps/rejected": -298.44622802734375, "loss": 0.6175, "rewards/accuracies": 0.875, "rewards/chosen": -1.2588945627212524, "rewards/grad_term": 0.014798032119870186, "rewards/margins": 5.93203592300415, "rewards/rejected": -7.1909308433532715, "step": 430 }, { "epoch": 0.8940749384156619, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 33.20014491905227, "learning_rate": 6.159169550173011e-07, "logits/chosen": 0.29805174469947815, "logits/rejected": 0.33232244849205017, "logps/accuracies": 0.8125, "logps/chosen": -336.54693603515625, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -328.70526123046875, "logps/ref_rejected": -330.7543029785156, "logps/rejected": -422.9801330566406, "loss": 0.5606, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7841659784317017, "rewards/grad_term": 0.006449039559811354, "rewards/margins": 8.438421249389648, "rewards/rejected": -9.222586631774902, "step": 431 }, { "epoch": 0.8961493582263711, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0625, "grad_norm": 30.864555897425223, "learning_rate": 6.147635524798154e-07, "logits/chosen": 0.12058807164430618, "logits/rejected": 0.13604214787483215, "logps/accuracies": 0.9375, "logps/chosen": -287.39691162109375, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -291.1800842285156, "logps/ref_rejected": -289.5397644042969, "logps/rejected": -376.7043151855469, "loss": 0.5595, "rewards/accuracies": 1.0, "rewards/chosen": 0.37831762433052063, "rewards/grad_term": 0.0001301583251915872, "rewards/margins": 9.094771385192871, "rewards/rejected": -8.716453552246094, "step": 432 }, { "epoch": 0.8982237780370802, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 48.56405217889427, "learning_rate": 6.136101499423299e-07, "logits/chosen": 0.3865184783935547, "logits/rejected": 0.4680458903312683, "logps/accuracies": 0.75, "logps/chosen": -285.0791015625, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -283.8902587890625, "logps/ref_rejected": -293.55670166015625, "logps/rejected": -369.1327209472656, "loss": 0.5467, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1188843846321106, "rewards/grad_term": 0.006384614389389753, "rewards/margins": 7.438718795776367, "rewards/rejected": -7.557602882385254, "step": 433 }, { "epoch": 0.9002981978477894, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 41.55010194963713, "learning_rate": 6.124567474048442e-07, "logits/chosen": 0.37142789363861084, "logits/rejected": 0.4116554856300354, "logps/accuracies": 0.6875, "logps/chosen": -264.35888671875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -255.59439086914062, "logps/ref_rejected": -254.5562744140625, "logps/rejected": -320.098388671875, "loss": 0.5228, "rewards/accuracies": 1.0, "rewards/chosen": -0.8764491081237793, "rewards/grad_term": 0.009346509352326393, "rewards/margins": 5.6777663230896, "rewards/rejected": -6.5542144775390625, "step": 434 }, { "epoch": 0.9023726176584986, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0625, "grad_norm": 41.02709525636894, "learning_rate": 6.113033448673587e-07, "logits/chosen": -0.03402477130293846, "logits/rejected": 0.10666719824075699, "logps/accuracies": 0.9375, "logps/chosen": -330.32745361328125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -331.1935729980469, "logps/ref_rejected": -342.39508056640625, "logps/rejected": -409.2434997558594, "loss": 0.5445, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08661463856697083, "rewards/grad_term": 0.011018088087439537, "rewards/margins": 6.771457672119141, "rewards/rejected": -6.684843063354492, "step": 435 }, { "epoch": 0.9044470374692078, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 46.61949118855806, "learning_rate": 6.101499423298731e-07, "logits/chosen": 0.033953070640563965, "logits/rejected": 0.005475502926856279, "logps/accuracies": 0.6875, "logps/chosen": -304.1283874511719, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -310.3152160644531, "logps/ref_rejected": -276.940185546875, "logps/rejected": -349.3773193359375, "loss": 0.5265, "rewards/accuracies": 1.0, "rewards/chosen": 0.6186816692352295, "rewards/grad_term": 0.0051316795870661736, "rewards/margins": 7.862398147583008, "rewards/rejected": -7.243716239929199, "step": 436 }, { "epoch": 0.9065214572799171, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 55.50940525398979, "learning_rate": 6.089965397923875e-07, "logits/chosen": 0.14566786587238312, "logits/rejected": 0.14087940752506256, "logps/accuracies": 0.875, "logps/chosen": -307.6397705078125, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -306.0394592285156, "logps/ref_rejected": -290.89068603515625, "logps/rejected": -374.4580078125, "loss": 0.5268, "rewards/accuracies": 1.0, "rewards/chosen": -0.1600308120250702, "rewards/grad_term": 0.003625539131462574, "rewards/margins": 8.196700096130371, "rewards/rejected": -8.356730461120605, "step": 437 }, { "epoch": 0.9085958770906262, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0625, "grad_norm": 36.429819834561926, "learning_rate": 6.078431372549019e-07, "logits/chosen": 0.17382624745368958, "logits/rejected": 0.17122478783130646, "logps/accuracies": 0.9375, "logps/chosen": -268.6485595703125, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -274.4229736328125, "logps/ref_rejected": -260.6642761230469, "logps/rejected": -344.1253662109375, "loss": 0.5025, "rewards/accuracies": 1.0, "rewards/chosen": 0.5774391889572144, "rewards/grad_term": 0.0027016454841941595, "rewards/margins": 8.923548698425293, "rewards/rejected": -8.346110343933105, "step": 438 }, { "epoch": 0.9106702969013354, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 33.56713984591113, "learning_rate": 6.066897347174163e-07, "logits/chosen": 0.09694240987300873, "logits/rejected": 0.22976186871528625, "logps/accuracies": 0.9375, "logps/chosen": -258.59161376953125, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -264.9013977050781, "logps/ref_rejected": -288.00958251953125, "logps/rejected": -353.9190673828125, "loss": 0.5424, "rewards/accuracies": 1.0, "rewards/chosen": 0.6309794187545776, "rewards/grad_term": 0.008919828571379185, "rewards/margins": 7.221925258636475, "rewards/rejected": -6.590945720672607, "step": 439 }, { "epoch": 0.9127447167120446, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.125, "flips/incorrect->incorrect": 0.3125, "grad_norm": 22.84688441463557, "learning_rate": 6.055363321799307e-07, "logits/chosen": -0.12110434472560883, "logits/rejected": -0.06775850802659988, "logps/accuracies": 0.625, "logps/chosen": -252.79736328125, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -241.74078369140625, "logps/ref_rejected": -250.46945190429688, "logps/rejected": -318.03790283203125, "loss": 0.5855, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1056597232818604, "rewards/grad_term": 0.011472761631011963, "rewards/margins": 5.651185989379883, "rewards/rejected": -6.756845951080322, "step": 440 }, { "epoch": 0.9148191365227538, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 37.379916915616754, "learning_rate": 6.043829296424452e-07, "logits/chosen": 0.10848057270050049, "logits/rejected": 0.11391180008649826, "logps/accuracies": 0.8125, "logps/chosen": -311.4754943847656, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -313.4561462402344, "logps/ref_rejected": -308.49609375, "logps/rejected": -389.3127136230469, "loss": 0.5174, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19806542992591858, "rewards/grad_term": 0.007313254754990339, "rewards/margins": 8.279730796813965, "rewards/rejected": -8.0816650390625, "step": 441 }, { "epoch": 0.916893556333463, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.125, "grad_norm": 29.537088633715857, "learning_rate": 6.032295271049595e-07, "logits/chosen": -0.04151641204953194, "logits/rejected": -0.03607035428285599, "logps/accuracies": 0.875, "logps/chosen": -309.4384765625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -318.02105712890625, "logps/ref_rejected": -330.68695068359375, "logps/rejected": -403.2144775390625, "loss": 0.5329, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8582614064216614, "rewards/grad_term": 0.007493661250919104, "rewards/margins": 8.111011505126953, "rewards/rejected": -7.252751350402832, "step": 442 }, { "epoch": 0.9189679761441721, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.25, "grad_norm": 27.777171215232052, "learning_rate": 6.02076124567474e-07, "logits/chosen": 0.2931877374649048, "logits/rejected": 0.2972795367240906, "logps/accuracies": 0.75, "logps/chosen": -301.7099609375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -301.0013122558594, "logps/ref_rejected": -281.2795104980469, "logps/rejected": -338.0366516113281, "loss": 0.564, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07086822390556335, "rewards/grad_term": 0.012654304504394531, "rewards/margins": 5.604846954345703, "rewards/rejected": -5.67571496963501, "step": 443 }, { "epoch": 0.9210423959548814, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 26.705457998840874, "learning_rate": 6.009227220299884e-07, "logits/chosen": 0.24231280386447906, "logits/rejected": 0.25109824538230896, "logps/accuracies": 0.8125, "logps/chosen": -255.10946655273438, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -262.9153747558594, "logps/ref_rejected": -290.76556396484375, "logps/rejected": -346.1953430175781, "loss": 0.5406, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7805902361869812, "rewards/grad_term": 0.00831932295113802, "rewards/margins": 6.32357120513916, "rewards/rejected": -5.542980670928955, "step": 444 }, { "epoch": 0.9231168157655906, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.3125, "grad_norm": 45.863702043985306, "learning_rate": 5.997693194925029e-07, "logits/chosen": 0.30657637119293213, "logits/rejected": 0.40025636553764343, "logps/accuracies": 0.6875, "logps/chosen": -378.9405517578125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -375.9232482910156, "logps/ref_rejected": -406.2908935546875, "logps/rejected": -476.7547912597656, "loss": 0.4917, "rewards/accuracies": 1.0, "rewards/chosen": -0.30173051357269287, "rewards/grad_term": 0.007800333667546511, "rewards/margins": 6.744661808013916, "rewards/rejected": -7.046392440795898, "step": 445 }, { "epoch": 0.9251912355762998, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.125, "grad_norm": 58.839145890605785, "learning_rate": 5.986159169550173e-07, "logits/chosen": 0.2454492151737213, "logits/rejected": 0.2175568789243698, "logps/accuracies": 0.875, "logps/chosen": -282.2199401855469, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -279.5404968261719, "logps/ref_rejected": -255.2272186279297, "logps/rejected": -328.5665283203125, "loss": 0.4951, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2679447829723358, "rewards/grad_term": 0.009870468638837337, "rewards/margins": 7.065983295440674, "rewards/rejected": -7.333928108215332, "step": 446 }, { "epoch": 0.9272656553870089, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 45.27476270388639, "learning_rate": 5.974625144175317e-07, "logits/chosen": 0.25637272000312805, "logits/rejected": 0.284266859292984, "logps/accuracies": 0.8125, "logps/chosen": -308.41693115234375, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -299.62353515625, "logps/ref_rejected": -297.4239807128906, "logps/rejected": -373.3579406738281, "loss": 0.5676, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8793376088142395, "rewards/grad_term": 0.01210443302989006, "rewards/margins": 6.714059829711914, "rewards/rejected": -7.5933966636657715, "step": 447 }, { "epoch": 0.9293400751977181, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 24.468747270838374, "learning_rate": 5.963091118800461e-07, "logits/chosen": -0.047158196568489075, "logits/rejected": -0.004086131229996681, "logps/accuracies": 0.8125, "logps/chosen": -363.8712158203125, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -358.2686462402344, "logps/ref_rejected": -370.7213439941406, "logps/rejected": -445.895263671875, "loss": 0.5564, "rewards/accuracies": 1.0, "rewards/chosen": -0.5602600574493408, "rewards/grad_term": 0.010198243893682957, "rewards/margins": 6.957134246826172, "rewards/rejected": -7.517394065856934, "step": 448 }, { "epoch": 0.9293400751977181, "eval_flips/correct->correct": 0.4433497488498688, "eval_flips/correct->incorrect": 0.0, "eval_flips/incorrect->correct": 0.37438422441482544, "eval_flips/incorrect->incorrect": 0.1822660118341446, "eval_logits/chosen": 0.12454497069120407, "eval_logits/rejected": 0.16565194725990295, "eval_logps/accuracies": 0.8177340030670166, "eval_logps/chosen": -297.1930847167969, "eval_logps/ref_accuracies": 0.4433497488498688, "eval_logps/ref_chosen": -287.3511047363281, "eval_logps/ref_rejected": -289.0460205078125, "eval_logps/rejected": -360.1458740234375, "eval_loss": 0.5838693976402283, "eval_rewards/accuracies": 0.9113300442695618, "eval_rewards/chosen": -0.9841962456703186, "eval_rewards/grad_term": 0.01190107874572277, "eval_rewards/margins": 6.125789165496826, "eval_rewards/rejected": -7.1099853515625, "eval_runtime": 804.5696, "eval_samples_per_second": 2.011, "eval_steps_per_second": 0.252, "step": 448 }, { "epoch": 0.9314144950084273, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 42.40430094029457, "learning_rate": 5.951557093425605e-07, "logits/chosen": 0.2953071594238281, "logits/rejected": 0.32737353444099426, "logps/accuracies": 0.75, "logps/chosen": -246.87020874023438, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -241.10800170898438, "logps/ref_rejected": -247.88864135742188, "logps/rejected": -314.1338806152344, "loss": 0.5558, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5762210488319397, "rewards/grad_term": 0.016563208773732185, "rewards/margins": 6.0483012199401855, "rewards/rejected": -6.6245222091674805, "step": 449 }, { "epoch": 0.9334889148191365, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 55.56936800158885, "learning_rate": 5.940023068050749e-07, "logits/chosen": -0.24604183435440063, "logits/rejected": -0.20258383452892303, "logps/accuracies": 0.9375, "logps/chosen": -280.2922058105469, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -271.5796813964844, "logps/ref_rejected": -273.0106506347656, "logps/rejected": -346.3686828613281, "loss": 0.5873, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8712505102157593, "rewards/grad_term": 0.010869830846786499, "rewards/margins": 6.4645562171936035, "rewards/rejected": -7.335805892944336, "step": 450 }, { "epoch": 0.9355633346298458, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.625, "flips/incorrect->incorrect": 0.0625, "grad_norm": 104.79125724883293, "learning_rate": 5.928489042675894e-07, "logits/chosen": 0.24167490005493164, "logits/rejected": 0.2851963937282562, "logps/accuracies": 0.9375, "logps/chosen": -315.26214599609375, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -313.26177978515625, "logps/ref_rejected": -285.7829895019531, "logps/rejected": -376.867919921875, "loss": 0.5462, "rewards/accuracies": 1.0, "rewards/chosen": -0.20003750920295715, "rewards/grad_term": 0.0005122160073369741, "rewards/margins": 8.908455848693848, "rewards/rejected": -9.10849380493164, "step": 451 }, { "epoch": 0.9376377544405549, "flips/correct->correct": 0.6875, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.125, "grad_norm": 29.758835328305054, "learning_rate": 5.916955017301037e-07, "logits/chosen": 0.2663220167160034, "logits/rejected": 0.3874686658382416, "logps/accuracies": 0.875, "logps/chosen": -262.48992919921875, "logps/ref_accuracies": 0.6875, "logps/ref_chosen": -257.1368408203125, "logps/ref_rejected": -272.6324462890625, "logps/rejected": -335.6940612792969, "loss": 0.5535, "rewards/accuracies": 1.0, "rewards/chosen": -0.5353102684020996, "rewards/grad_term": 0.008673110976815224, "rewards/margins": 5.770854473114014, "rewards/rejected": -6.3061652183532715, "step": 452 }, { "epoch": 0.9397121742512641, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.25, "grad_norm": 17.42639637388175, "learning_rate": 5.905420991926182e-07, "logits/chosen": 0.2760721743106842, "logits/rejected": 0.3189687430858612, "logps/accuracies": 0.75, "logps/chosen": -275.108642578125, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -274.26934814453125, "logps/ref_rejected": -267.088134765625, "logps/rejected": -331.2291564941406, "loss": 0.4576, "rewards/accuracies": 0.875, "rewards/chosen": -0.0839340090751648, "rewards/grad_term": 0.01929977536201477, "rewards/margins": 6.330172538757324, "rewards/rejected": -6.414106369018555, "step": 453 }, { "epoch": 0.9417865940619733, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 43.82809699832689, "learning_rate": 5.893886966551325e-07, "logits/chosen": 0.21667756140232086, "logits/rejected": 0.20864138007164001, "logps/accuracies": 0.6875, "logps/chosen": -275.2607727050781, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -276.90924072265625, "logps/ref_rejected": -287.8323974609375, "logps/rejected": -336.0874328613281, "loss": 0.5327, "rewards/accuracies": 0.875, "rewards/chosen": 0.16484564542770386, "rewards/grad_term": 0.018737200647592545, "rewards/margins": 4.990347862243652, "rewards/rejected": -4.825502395629883, "step": 454 }, { "epoch": 0.9438610138726825, "flips/correct->correct": 0.75, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.0625, "grad_norm": 38.23253448586923, "learning_rate": 5.88235294117647e-07, "logits/chosen": 0.11684215813875198, "logits/rejected": 0.24031777679920197, "logps/accuracies": 0.9375, "logps/chosen": -313.59710693359375, "logps/ref_accuracies": 0.75, "logps/ref_chosen": -317.1141052246094, "logps/ref_rejected": -372.68316650390625, "logps/rejected": -439.81866455078125, "loss": 0.5429, "rewards/accuracies": 0.9375, "rewards/chosen": 0.35169944167137146, "rewards/grad_term": 0.00929531641304493, "rewards/margins": 7.065249919891357, "rewards/rejected": -6.713550567626953, "step": 455 }, { "epoch": 0.9459354336833917, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5625, "flips/incorrect->incorrect": 0.0625, "grad_norm": 56.76327899023629, "learning_rate": 5.870818915801614e-07, "logits/chosen": 0.10899796336889267, "logits/rejected": 0.17427489161491394, "logps/accuracies": 0.9375, "logps/chosen": -296.0990295410156, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -301.2550048828125, "logps/ref_rejected": -292.7115173339844, "logps/rejected": -371.61187744140625, "loss": 0.5333, "rewards/accuracies": 0.875, "rewards/chosen": 0.5155962705612183, "rewards/grad_term": 0.00969706755131483, "rewards/margins": 8.405632972717285, "rewards/rejected": -7.890036582946777, "step": 456 }, { "epoch": 0.9480098534941008, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.1875, "grad_norm": 55.72280578194114, "learning_rate": 5.859284890426759e-07, "logits/chosen": -0.003691728226840496, "logits/rejected": 0.0035413503646850586, "logps/accuracies": 0.8125, "logps/chosen": -304.0773010253906, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -317.5519104003906, "logps/ref_rejected": -319.302978515625, "logps/rejected": -378.9814453125, "loss": 0.5376, "rewards/accuracies": 0.875, "rewards/chosen": 1.347464919090271, "rewards/grad_term": 0.011599891819059849, "rewards/margins": 7.315312385559082, "rewards/rejected": -5.9678473472595215, "step": 457 }, { "epoch": 0.95008427330481, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0625, "grad_norm": 63.66588058556034, "learning_rate": 5.847750865051903e-07, "logits/chosen": -0.25413990020751953, "logits/rejected": -0.17748790979385376, "logps/accuracies": 0.9375, "logps/chosen": -318.7485656738281, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -321.310546875, "logps/ref_rejected": -325.7279052734375, "logps/rejected": -377.45843505859375, "loss": 0.5057, "rewards/accuracies": 0.875, "rewards/chosen": 0.2561964988708496, "rewards/grad_term": 0.013258688151836395, "rewards/margins": 5.429249286651611, "rewards/rejected": -5.173052787780762, "step": 458 }, { "epoch": 0.9521586931155193, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.25, "grad_norm": 29.584209944027126, "learning_rate": 5.836216839677048e-07, "logits/chosen": -0.028666552156209946, "logits/rejected": 0.027672436088323593, "logps/accuracies": 0.75, "logps/chosen": -297.3187561035156, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -300.38385009765625, "logps/ref_rejected": -299.2121276855469, "logps/rejected": -364.8226623535156, "loss": 0.464, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3065095543861389, "rewards/grad_term": 0.009040933102369308, "rewards/margins": 6.867563247680664, "rewards/rejected": -6.561053276062012, "step": 459 }, { "epoch": 0.9542331129262285, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.125, "grad_norm": 36.21237401329387, "learning_rate": 5.824682814302191e-07, "logits/chosen": 0.010993116535246372, "logits/rejected": 0.10759762674570084, "logps/accuracies": 0.8125, "logps/chosen": -247.78744506835938, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -243.45068359375, "logps/ref_rejected": -238.94656372070312, "logps/rejected": -306.70025634765625, "loss": 0.5576, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4336736798286438, "rewards/grad_term": 0.010049809701740742, "rewards/margins": 6.341697692871094, "rewards/rejected": -6.775371551513672, "step": 460 }, { "epoch": 0.9563075327369377, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 67.07396455833435, "learning_rate": 5.813148788927336e-07, "logits/chosen": 0.2826724350452423, "logits/rejected": 0.30516886711120605, "logps/accuracies": 0.9375, "logps/chosen": -303.901611328125, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -310.4884948730469, "logps/ref_rejected": -321.95538330078125, "logps/rejected": -394.9017639160156, "loss": 0.5389, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6586897373199463, "rewards/grad_term": 0.007842399179935455, "rewards/margins": 7.953330993652344, "rewards/rejected": -7.294641971588135, "step": 461 }, { "epoch": 0.9583819525476468, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.0, "grad_norm": 29.22489915607507, "learning_rate": 5.801614763552479e-07, "logits/chosen": 0.12667125463485718, "logits/rejected": 0.24145105481147766, "logps/accuracies": 1.0, "logps/chosen": -228.75924682617188, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -230.16534423828125, "logps/ref_rejected": -276.22015380859375, "logps/rejected": -355.9960632324219, "loss": 0.5172, "rewards/accuracies": 1.0, "rewards/chosen": 0.14060965180397034, "rewards/grad_term": 0.003057720372453332, "rewards/margins": 8.118200302124023, "rewards/rejected": -7.977591037750244, "step": 462 }, { "epoch": 0.960456372358356, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 46.6463409683984, "learning_rate": 5.790080738177624e-07, "logits/chosen": 0.22849154472351074, "logits/rejected": 0.2621627748012543, "logps/accuracies": 0.875, "logps/chosen": -352.41827392578125, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -345.7464294433594, "logps/ref_rejected": -329.25946044921875, "logps/rejected": -404.5802001953125, "loss": 0.4936, "rewards/accuracies": 0.875, "rewards/chosen": -0.6671811938285828, "rewards/grad_term": 0.010429211892187595, "rewards/margins": 6.864894866943359, "rewards/rejected": -7.532076835632324, "step": 463 }, { "epoch": 0.9625307921690652, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.0625, "grad_norm": 23.56747059673345, "learning_rate": 5.778546712802767e-07, "logits/chosen": 0.07257233560085297, "logits/rejected": 0.10124337673187256, "logps/accuracies": 0.9375, "logps/chosen": -296.21209716796875, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -301.79473876953125, "logps/ref_rejected": -295.86785888671875, "logps/rejected": -370.70001220703125, "loss": 0.5419, "rewards/accuracies": 0.9375, "rewards/chosen": 0.558262288570404, "rewards/grad_term": 0.007962905801832676, "rewards/margins": 8.041479110717773, "rewards/rejected": -7.483217239379883, "step": 464 }, { "epoch": 0.9646052119797744, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 32.887206960447294, "learning_rate": 5.767012687427912e-07, "logits/chosen": -0.02765033021569252, "logits/rejected": -0.04159718379378319, "logps/accuracies": 0.875, "logps/chosen": -301.87750244140625, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -294.6784973144531, "logps/ref_rejected": -315.06695556640625, "logps/rejected": -396.1573486328125, "loss": 0.5015, "rewards/accuracies": 0.875, "rewards/chosen": -0.7198995351791382, "rewards/grad_term": 0.013939508236944675, "rewards/margins": 7.389136791229248, "rewards/rejected": -8.109036445617676, "step": 465 }, { "epoch": 0.9666796317904836, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.5, "flips/incorrect->incorrect": 0.25, "grad_norm": 31.89130120157542, "learning_rate": 5.755478662053056e-07, "logits/chosen": 0.06474259495735168, "logits/rejected": 0.1507914811372757, "logps/accuracies": 0.75, "logps/chosen": -362.8743896484375, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -358.84747314453125, "logps/ref_rejected": -336.5288391113281, "logps/rejected": -405.94775390625, "loss": 0.4804, "rewards/accuracies": 0.875, "rewards/chosen": -0.4026913344860077, "rewards/grad_term": 0.012680365703999996, "rewards/margins": 6.539196014404297, "rewards/rejected": -6.941887378692627, "step": 466 }, { "epoch": 0.9687540516011928, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.0625, "grad_norm": 24.005282908584242, "learning_rate": 5.7439446366782e-07, "logits/chosen": 0.46883174777030945, "logits/rejected": 0.5026016235351562, "logps/accuracies": 0.9375, "logps/chosen": -274.5486145019531, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -275.224853515625, "logps/ref_rejected": -294.5133361816406, "logps/rejected": -363.38482666015625, "loss": 0.5302, "rewards/accuracies": 0.9375, "rewards/chosen": 0.06762228161096573, "rewards/grad_term": 0.0132514713332057, "rewards/margins": 6.954771041870117, "rewards/rejected": -6.887148857116699, "step": 467 }, { "epoch": 0.970828471411902, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.3125, "grad_norm": 93.90250048693143, "learning_rate": 5.732410611303344e-07, "logits/chosen": 0.13799475133419037, "logits/rejected": 0.10960017144680023, "logps/accuracies": 0.6875, "logps/chosen": -306.63336181640625, "logps/ref_accuracies": 0.5, "logps/ref_chosen": -298.33892822265625, "logps/ref_rejected": -294.1246032714844, "logps/rejected": -364.4803466796875, "loss": 0.558, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8294415473937988, "rewards/grad_term": 0.011318499222397804, "rewards/margins": 6.206131458282471, "rewards/rejected": -7.0355730056762695, "step": 468 }, { "epoch": 0.9729028912226112, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 33.22451982924039, "learning_rate": 5.72087658592849e-07, "logits/chosen": 0.2051057368516922, "logits/rejected": 0.39599326252937317, "logps/accuracies": 0.875, "logps/chosen": -297.212890625, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -296.35797119140625, "logps/ref_rejected": -341.98760986328125, "logps/rejected": -415.61602783203125, "loss": 0.577, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08549117296934128, "rewards/grad_term": 0.008653431199491024, "rewards/margins": 7.277350902557373, "rewards/rejected": -7.362841606140137, "step": 469 }, { "epoch": 0.9749773110333204, "flips/correct->correct": 0.5, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 55.27297145431258, "learning_rate": 5.709342560553633e-07, "logits/chosen": 0.406170129776001, "logits/rejected": 0.4695666432380676, "logps/accuracies": 0.75, "logps/chosen": -267.87518310546875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -264.7390441894531, "logps/ref_rejected": -286.3910217285156, "logps/rejected": -358.07012939453125, "loss": 0.4829, "rewards/accuracies": 0.875, "rewards/chosen": -0.3136145770549774, "rewards/grad_term": 0.010346844792366028, "rewards/margins": 6.854294776916504, "rewards/rejected": -7.167908668518066, "step": 470 }, { "epoch": 0.9770517308440295, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.5, "grad_norm": 46.76169337104463, "learning_rate": 5.697808535178778e-07, "logits/chosen": 0.021602880209684372, "logits/rejected": 0.10054953396320343, "logps/accuracies": 0.4375, "logps/chosen": -277.87420654296875, "logps/ref_accuracies": 0.3125, "logps/ref_chosen": -278.6906433105469, "logps/ref_rejected": -260.7660827636719, "logps/rejected": -313.3102722167969, "loss": 0.523, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08164243400096893, "rewards/grad_term": 0.015251345932483673, "rewards/margins": 5.336061954498291, "rewards/rejected": -5.254419326782227, "step": 471 }, { "epoch": 0.9791261506547387, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.3125, "flips/incorrect->incorrect": 0.125, "grad_norm": 22.784427030938893, "learning_rate": 5.686274509803921e-07, "logits/chosen": 0.40202221274375916, "logits/rejected": 0.47440534830093384, "logps/accuracies": 0.875, "logps/chosen": -262.98046875, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -269.6217956542969, "logps/ref_rejected": -294.6548767089844, "logps/rejected": -360.1872253417969, "loss": 0.5311, "rewards/accuracies": 0.875, "rewards/chosen": 0.6641333103179932, "rewards/grad_term": 0.008491192013025284, "rewards/margins": 7.217367172241211, "rewards/rejected": -6.553234100341797, "step": 472 }, { "epoch": 0.981200570465448, "flips/correct->correct": 0.25, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.3125, "grad_norm": 17.893894514454058, "learning_rate": 5.674740484429066e-07, "logits/chosen": 0.2951053977012634, "logits/rejected": 0.31693655252456665, "logps/accuracies": 0.6875, "logps/chosen": -262.5206298828125, "logps/ref_accuracies": 0.25, "logps/ref_chosen": -264.88336181640625, "logps/ref_rejected": -253.90045166015625, "logps/rejected": -300.7421875, "loss": 0.5814, "rewards/accuracies": 0.8125, "rewards/chosen": 0.23627310991287231, "rewards/grad_term": 0.016665775328874588, "rewards/margins": 4.920448303222656, "rewards/rejected": -4.684175491333008, "step": 473 }, { "epoch": 0.9832749902761572, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.25, "grad_norm": 43.55914775192543, "learning_rate": 5.66320645905421e-07, "logits/chosen": -0.024594342336058617, "logits/rejected": 0.07602076232433319, "logps/accuracies": 0.75, "logps/chosen": -239.41683959960938, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -248.70411682128906, "logps/ref_rejected": -289.31231689453125, "logps/rejected": -342.78070068359375, "loss": 0.6143, "rewards/accuracies": 1.0, "rewards/chosen": 0.9287264347076416, "rewards/grad_term": 0.012531589716672897, "rewards/margins": 6.275561332702637, "rewards/rejected": -5.346835136413574, "step": 474 }, { "epoch": 0.9853494100868664, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.1875, "grad_norm": 62.70114024432645, "learning_rate": 5.651672433679354e-07, "logits/chosen": 0.10585808008909225, "logits/rejected": 0.11864355206489563, "logps/accuracies": 0.8125, "logps/chosen": -296.0154724121094, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -307.2314453125, "logps/ref_rejected": -307.9908752441406, "logps/rejected": -372.58465576171875, "loss": 0.5211, "rewards/accuracies": 0.875, "rewards/chosen": 1.1215964555740356, "rewards/grad_term": 0.0117443036288023, "rewards/margins": 7.5809736251831055, "rewards/rejected": -6.459376811981201, "step": 475 }, { "epoch": 0.9874238298975755, "flips/correct->correct": 0.375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.1875, "grad_norm": 49.433246174946895, "learning_rate": 5.640138408304498e-07, "logits/chosen": -0.0011881794780492783, "logits/rejected": 0.056654639542102814, "logps/accuracies": 0.8125, "logps/chosen": -287.03497314453125, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -299.6839904785156, "logps/ref_rejected": -300.8826904296875, "logps/rejected": -363.1770935058594, "loss": 0.5397, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2649037837982178, "rewards/grad_term": 0.011040883138775826, "rewards/margins": 7.494347095489502, "rewards/rejected": -6.229443550109863, "step": 476 }, { "epoch": 0.9894982497082847, "flips/correct->correct": 0.5625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.1875, "flips/incorrect->incorrect": 0.25, "grad_norm": 37.13699226629539, "learning_rate": 5.628604382929642e-07, "logits/chosen": 0.22437655925750732, "logits/rejected": 0.2689896821975708, "logps/accuracies": 0.75, "logps/chosen": -257.7532958984375, "logps/ref_accuracies": 0.5625, "logps/ref_chosen": -264.4325256347656, "logps/ref_rejected": -272.7551574707031, "logps/rejected": -302.99462890625, "loss": 0.56, "rewards/accuracies": 0.75, "rewards/chosen": 0.667922854423523, "rewards/grad_term": 0.02357163466513157, "rewards/margins": 3.6918697357177734, "rewards/rejected": -3.02394700050354, "step": 477 }, { "epoch": 0.9915726695189939, "flips/correct->correct": 0.4375, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.4375, "flips/incorrect->incorrect": 0.125, "grad_norm": 58.00284238737268, "learning_rate": 5.617070357554786e-07, "logits/chosen": 0.23218779265880585, "logits/rejected": 0.20929032564163208, "logps/accuracies": 0.875, "logps/chosen": -320.3446350097656, "logps/ref_accuracies": 0.4375, "logps/ref_chosen": -321.2319641113281, "logps/ref_rejected": -327.994140625, "logps/rejected": -400.4293212890625, "loss": 0.5079, "rewards/accuracies": 0.875, "rewards/chosen": 0.08873284608125687, "rewards/grad_term": 0.011475574225187302, "rewards/margins": 7.332255840301514, "rewards/rejected": -7.243522644042969, "step": 478 }, { "epoch": 0.9936470893297031, "flips/correct->correct": 0.3125, "flips/correct->incorrect": 0.0625, "flips/incorrect->correct": 0.375, "flips/incorrect->incorrect": 0.25, "grad_norm": 30.26855049657639, "learning_rate": 5.605536332179931e-07, "logits/chosen": 0.20646262168884277, "logits/rejected": 0.18982850015163422, "logps/accuracies": 0.6875, "logps/chosen": -338.36700439453125, "logps/ref_accuracies": 0.375, "logps/ref_chosen": -340.2660217285156, "logps/ref_rejected": -332.7535705566406, "logps/rejected": -394.439697265625, "loss": 0.535, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1898985505104065, "rewards/grad_term": 0.012958088889718056, "rewards/margins": 6.358510971069336, "rewards/rejected": -6.168612957000732, "step": 479 }, { "epoch": 0.9957215091404122, "flips/correct->correct": 0.625, "flips/correct->incorrect": 0.0, "flips/incorrect->correct": 0.25, "flips/incorrect->incorrect": 0.125, "grad_norm": 19.142226432349567, "learning_rate": 5.594002306805074e-07, "logits/chosen": 0.26075422763824463, "logits/rejected": 0.3115725517272949, "logps/accuracies": 0.875, "logps/chosen": -263.19952392578125, "logps/ref_accuracies": 0.625, "logps/ref_chosen": -252.8601531982422, "logps/ref_rejected": -255.4573516845703, "logps/rejected": -332.5611267089844, "loss": 0.5524, "rewards/accuracies": 0.8125, "rewards/chosen": -1.033937931060791, "rewards/grad_term": 0.014926022849977016, "rewards/margins": 6.676440715789795, "rewards/rejected": -7.710378170013428, "step": 480 } ], "logging_steps": 1, "max_steps": 964, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 96, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }