diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11728 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9957215091404122, + "eval_steps": 64, + "global_step": 480, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002074419810709192, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5625, + "grad_norm": 30.445291710986226, + "learning_rate": 0.0, + "logits/chosen": 1.3143655061721802, + "logits/rejected": 1.334812045097351, + "logps/accuracies": 0.4375, + "logps/chosen": -329.3199157714844, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -329.3199157714844, + "logps/ref_rejected": -308.284912109375, + "logps/rejected": -308.284912109375, + "loss": 1.0, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/grad_term": 0.05000000447034836, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.004148839621418384, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 31.539643174909184, + "learning_rate": 1.5151715240963886e-07, + "logits/chosen": 1.136220932006836, + "logits/rejected": 1.1561778783798218, + "logps/accuracies": 0.5625, + "logps/chosen": -280.4060363769531, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -280.13531494140625, + "logps/ref_rejected": -287.2406005859375, + "logps/rejected": -287.34637451171875, + "loss": 0.9925, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.027070851996541023, + "rewards/grad_term": 0.05042332783341408, + "rewards/margins": -0.01649157702922821, + "rewards/rejected": -0.010579276829957962, + "step": 2 + }, + { + "epoch": 0.006223259432127577, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.6875, + "grad_norm": 35.669754591330424, + "learning_rate": 2.401490047853298e-07, + "logits/chosen": 1.6139692068099976, + "logits/rejected": 1.5537246465682983, + "logps/accuracies": 0.3125, + "logps/chosen": -279.83502197265625, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -279.268310546875, + "logps/ref_rejected": -258.850341796875, + "logps/rejected": -259.1755065917969, + "loss": 0.9854, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.056671928614377975, + "rewards/grad_term": 0.050606753677129745, + "rewards/margins": -0.024156270548701286, + "rewards/rejected": -0.03251565620303154, + "step": 3 + }, + { + "epoch": 0.008297679242836769, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5625, + "grad_norm": 35.57252658535935, + "learning_rate": 3.030343048192777e-07, + "logits/chosen": 1.7025470733642578, + "logits/rejected": 1.6247684955596924, + "logps/accuracies": 0.4375, + "logps/chosen": -321.3578796386719, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -321.1121826171875, + "logps/ref_rejected": -316.1632995605469, + "logps/rejected": -316.3089599609375, + "loss": 0.9885, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.024570418521761894, + "rewards/grad_term": 0.050244007259607315, + "rewards/margins": -0.010007334873080254, + "rewards/rejected": -0.014563081786036491, + "step": 4 + }, + { + "epoch": 0.010372099053545962, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 40.58061807061268, + "learning_rate": 3.5181193303727093e-07, + "logits/chosen": 1.4226707220077515, + "logits/rejected": 1.505796194076538, + "logps/accuracies": 0.625, + "logps/chosen": -245.84014892578125, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -245.4801483154297, + "logps/ref_rejected": -251.10232543945312, + "logps/rejected": -251.73736572265625, + "loss": 0.9812, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03600040823221207, + "rewards/grad_term": 0.049336254596710205, + "rewards/margins": 0.02750583179295063, + "rewards/rejected": -0.06350623816251755, + "step": 5 + }, + { + "epoch": 0.012446518864255154, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 27.99149170411343, + "learning_rate": 3.9166615719496866e-07, + "logits/chosen": 1.3876930475234985, + "logits/rejected": 1.4264953136444092, + "logps/accuracies": 0.6875, + "logps/chosen": -291.3964538574219, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -291.29791259765625, + "logps/ref_rejected": -307.63433837890625, + "logps/rejected": -309.51678466796875, + "loss": 0.9803, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009854471310973167, + "rewards/grad_term": 0.04629334807395935, + "rewards/margins": 0.17839080095291138, + "rewards/rejected": -0.1882452815771103, + "step": 6 + }, + { + "epoch": 0.014520938674964345, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.625, + "grad_norm": 35.07512813691069, + "learning_rate": 4.253624235933518e-07, + "logits/chosen": 1.3303760290145874, + "logits/rejected": 1.41872239112854, + "logps/accuracies": 0.375, + "logps/chosen": -256.7828369140625, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -256.71258544921875, + "logps/ref_rejected": -252.4455108642578, + "logps/rejected": -255.78366088867188, + "loss": 0.9451, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.007027318701148033, + "rewards/grad_term": 0.04277125000953674, + "rewards/margins": 0.3267865777015686, + "rewards/rejected": -0.3338139057159424, + "step": 7 + }, + { + "epoch": 0.016595358485673537, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 53.4690693475631, + "learning_rate": 4.545514572289166e-07, + "logits/chosen": 1.4601320028305054, + "logits/rejected": 1.5025103092193604, + "logps/accuracies": 0.25, + "logps/chosen": -328.91387939453125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -328.650634765625, + "logps/ref_rejected": -320.340576171875, + "logps/rejected": -322.7126159667969, + "loss": 0.9334, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.026328086853027344, + "rewards/grad_term": 0.04583045467734337, + "rewards/margins": 0.2108786404132843, + "rewards/rejected": -0.23720674216747284, + "step": 8 + }, + { + "epoch": 0.01866977829638273, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 50.206274648941296, + "learning_rate": 4.802980095706596e-07, + "logits/chosen": 1.516817331314087, + "logits/rejected": 1.5115327835083008, + "logps/accuracies": 0.625, + "logps/chosen": -271.8262634277344, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -272.4455261230469, + "logps/ref_rejected": -264.29541015625, + "logps/rejected": -272.98419189453125, + "loss": 0.9138, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0619237944483757, + "rewards/grad_term": 0.037300530821084976, + "rewards/margins": 0.930805504322052, + "rewards/rejected": -0.8688817620277405, + "step": 9 + }, + { + "epoch": 0.020744198107091924, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 68.87775211865134, + "learning_rate": 5.033290854469099e-07, + "logits/chosen": 1.1356251239776611, + "logits/rejected": 1.1563141345977783, + "logps/accuracies": 0.5625, + "logps/chosen": -303.23114013671875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -303.0604553222656, + "logps/ref_rejected": -303.90673828125, + "logps/rejected": -306.86865234375, + "loss": 0.9189, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.017069505527615547, + "rewards/grad_term": 0.04398500546813011, + "rewards/margins": 0.27912360429763794, + "rewards/rejected": -0.29619312286376953, + "step": 10 + }, + { + "epoch": 0.022818617917801116, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 57.71463843312805, + "learning_rate": 5.241632278117911e-07, + "logits/chosen": 1.4097551107406616, + "logits/rejected": 1.5242267847061157, + "logps/accuracies": 0.6875, + "logps/chosen": -328.52459716796875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -328.664306640625, + "logps/ref_rejected": -354.8746032714844, + "logps/rejected": -370.66998291015625, + "loss": 0.874, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.013973474502563477, + "rewards/grad_term": 0.029512763023376465, + "rewards/margins": 1.5935115814208984, + "rewards/rejected": -1.579538106918335, + "step": 11 + }, + { + "epoch": 0.024893037728510307, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 19.34405027942017, + "learning_rate": 5.431833096046075e-07, + "logits/chosen": 1.2825186252593994, + "logits/rejected": 1.4041016101837158, + "logps/accuracies": 0.6875, + "logps/chosen": -339.60736083984375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -332.350341796875, + "logps/ref_rejected": -342.4404602050781, + "logps/rejected": -393.659423828125, + "loss": 0.7647, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7256991863250732, + "rewards/grad_term": 0.022119037806987762, + "rewards/margins": 4.3961944580078125, + "rewards/rejected": -5.121893405914307, + "step": 12 + }, + { + "epoch": 0.0269674575392195, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 18.068273844723095, + "learning_rate": 5.606800887562651e-07, + "logits/chosen": 1.3912986516952515, + "logits/rejected": 1.4100464582443237, + "logps/accuracies": 0.625, + "logps/chosen": -360.9014587402344, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -349.53863525390625, + "logps/ref_rejected": -341.5389709472656, + "logps/rejected": -384.8885498046875, + "loss": 0.7998, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.136283040046692, + "rewards/grad_term": 0.016487201675772667, + "rewards/margins": 3.198676347732544, + "rewards/rejected": -4.334959506988525, + "step": 13 + }, + { + "epoch": 0.02904187734992869, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 27.645843878628447, + "learning_rate": 5.768795760029907e-07, + "logits/chosen": 1.4637022018432617, + "logits/rejected": 1.4576082229614258, + "logps/accuracies": 0.5625, + "logps/chosen": -319.2772216796875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -297.5706787109375, + "logps/ref_rejected": -267.2304382324219, + "logps/rejected": -324.9382629394531, + "loss": 0.7958, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1706531047821045, + "rewards/grad_term": 0.016141919419169426, + "rewards/margins": 3.600131034851074, + "rewards/rejected": -5.770784378051758, + "step": 14 + }, + { + "epoch": 0.031116297160637883, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 24.62720014572735, + "learning_rate": 5.919609378226007e-07, + "logits/chosen": 1.4202252626419067, + "logits/rejected": 1.5137040615081787, + "logps/accuracies": 0.8125, + "logps/chosen": -354.2460632324219, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -320.8343200683594, + "logps/ref_rejected": -324.75347900390625, + "logps/rejected": -385.1374816894531, + "loss": 0.8253, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.341172933578491, + "rewards/grad_term": 0.024179620668292046, + "rewards/margins": 2.697230815887451, + "rewards/rejected": -6.0384039878845215, + "step": 15 + }, + { + "epoch": 0.033190716971347074, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 24.048376194470723, + "learning_rate": 6.060686096385554e-07, + "logits/chosen": 1.3625872135162354, + "logits/rejected": 1.5674644708633423, + "logps/accuracies": 0.75, + "logps/chosen": -321.19952392578125, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -300.4372253417969, + "logps/ref_rejected": -299.97760009765625, + "logps/rejected": -369.1448974609375, + "loss": 0.7917, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0762338638305664, + "rewards/grad_term": 0.01863047480583191, + "rewards/margins": 4.8404951095581055, + "rewards/rejected": -6.916728973388672, + "step": 16 + }, + { + "epoch": 0.03526513678205627, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 45.28780856236718, + "learning_rate": 6.193207302864632e-07, + "logits/chosen": 1.3500301837921143, + "logits/rejected": 1.3641669750213623, + "logps/accuracies": 0.75, + "logps/chosen": -257.4446105957031, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -241.6280517578125, + "logps/ref_rejected": -237.22329711914062, + "logps/rejected": -294.7928466796875, + "loss": 0.7765, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5816560983657837, + "rewards/grad_term": 0.019907817244529724, + "rewards/margins": 4.175297737121582, + "rewards/rejected": -5.756953239440918, + "step": 17 + }, + { + "epoch": 0.03733955659276546, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 33.38191088083146, + "learning_rate": 6.318151619802984e-07, + "logits/chosen": 1.1816256046295166, + "logits/rejected": 1.2595932483673096, + "logps/accuracies": 0.75, + "logps/chosen": -317.13775634765625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -302.21380615234375, + "logps/ref_rejected": -337.5311279296875, + "logps/rejected": -381.22796630859375, + "loss": 0.87, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4923958778381348, + "rewards/grad_term": 0.024127114564180374, + "rewards/margins": 2.877284288406372, + "rewards/rejected": -4.369679927825928, + "step": 18 + }, + { + "epoch": 0.03941397640347465, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 40.949935957102895, + "learning_rate": 6.436338804795301e-07, + "logits/chosen": 1.4050649404525757, + "logits/rejected": 1.4994277954101562, + "logps/accuracies": 0.875, + "logps/chosen": -292.9769287109375, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -263.2729187011719, + "logps/ref_rejected": -291.3985290527344, + "logps/rejected": -359.13031005859375, + "loss": 0.8089, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9704017639160156, + "rewards/grad_term": 0.02432025596499443, + "rewards/margins": 3.802779197692871, + "rewards/rejected": -6.773180961608887, + "step": 19 + }, + { + "epoch": 0.04148839621418385, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 30.4272452916532, + "learning_rate": 6.548462378565487e-07, + "logits/chosen": 1.5738377571105957, + "logits/rejected": 1.5692741870880127, + "logps/accuracies": 0.8125, + "logps/chosen": -281.9122619628906, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -266.5782775878906, + "logps/ref_rejected": -265.3857421875, + "logps/rejected": -338.1681213378906, + "loss": 0.8287, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5333983898162842, + "rewards/grad_term": 0.01711239479482174, + "rewards/margins": 5.744836807250977, + "rewards/rejected": -7.278235912322998, + "step": 20 + }, + { + "epoch": 0.043562816024893036, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 21.481613213914592, + "learning_rate": 6.655114283786817e-07, + "logits/chosen": 1.495798945426941, + "logits/rejected": 1.575231909751892, + "logps/accuracies": 0.5625, + "logps/chosen": -329.2530517578125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -307.7621154785156, + "logps/ref_rejected": -305.4177551269531, + "logps/rejected": -353.7918395996094, + "loss": 0.7972, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1490931510925293, + "rewards/grad_term": 0.026765087619423866, + "rewards/margins": 2.6883203983306885, + "rewards/rejected": -4.8374128341674805, + "step": 21 + }, + { + "epoch": 0.04563723583560223, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 43.410701214740534, + "learning_rate": 6.7568038022143e-07, + "logits/chosen": 1.3690290451049805, + "logits/rejected": 1.4588748216629028, + "logps/accuracies": 0.75, + "logps/chosen": -303.6971740722656, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -295.21514892578125, + "logps/ref_rejected": -289.0858459472656, + "logps/rejected": -341.2356262207031, + "loss": 0.756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8482051491737366, + "rewards/grad_term": 0.01651611179113388, + "rewards/margins": 4.366776466369629, + "rewards/rejected": -5.214981555938721, + "step": 22 + }, + { + "epoch": 0.04771165564631142, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 40.534862406872996, + "learning_rate": 6.853972263303346e-07, + "logits/chosen": 1.460001826286316, + "logits/rejected": 1.4567062854766846, + "logps/accuracies": 0.625, + "logps/chosen": -357.6313171386719, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -350.6695251464844, + "logps/ref_rejected": -331.9695129394531, + "logps/rejected": -388.808349609375, + "loss": 0.7408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6961804628372192, + "rewards/grad_term": 0.018778638914227486, + "rewards/margins": 4.987700939178467, + "rewards/rejected": -5.683881759643555, + "step": 23 + }, + { + "epoch": 0.049786075457020615, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 25.219410545582974, + "learning_rate": 6.947004620142464e-07, + "logits/chosen": 1.494457483291626, + "logits/rejected": 1.583849310874939, + "logps/accuracies": 0.8125, + "logps/chosen": -303.1878662109375, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -287.615234375, + "logps/ref_rejected": -305.63421630859375, + "logps/rejected": -363.3654479980469, + "loss": 0.741, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5572607517242432, + "rewards/grad_term": 0.01411459967494011, + "rewards/margins": 4.215861797332764, + "rewards/rejected": -5.773122787475586, + "step": 24 + }, + { + "epoch": 0.05186049526772981, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 21.34830172085934, + "learning_rate": 7.036238660745419e-07, + "logits/chosen": 1.2966912984848022, + "logits/rejected": 1.3436973094940186, + "logps/accuracies": 0.8125, + "logps/chosen": -318.70855712890625, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -309.7275390625, + "logps/ref_rejected": -321.34222412109375, + "logps/rejected": -374.23724365234375, + "loss": 0.7473, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8981032371520996, + "rewards/grad_term": 0.030619274824857712, + "rewards/margins": 4.391395092010498, + "rewards/rejected": -5.289497375488281, + "step": 25 + }, + { + "epoch": 0.053934915078439, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 24.70213159620223, + "learning_rate": 7.121972411659039e-07, + "logits/chosen": 1.527209997177124, + "logits/rejected": 1.5188902616500854, + "logps/accuracies": 0.75, + "logps/chosen": -311.9977722167969, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -312.7909240722656, + "logps/ref_rejected": -319.8678283691406, + "logps/rejected": -361.7353515625, + "loss": 0.7263, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07931968569755554, + "rewards/grad_term": 0.0128245297819376, + "rewards/margins": 4.26607084274292, + "rewards/rejected": -4.186751365661621, + "step": 26 + }, + { + "epoch": 0.056009334889148193, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 21.657435932051584, + "learning_rate": 7.204470143559894e-07, + "logits/chosen": 1.0803958177566528, + "logits/rejected": 1.182570219039917, + "logps/accuracies": 0.625, + "logps/chosen": -305.4505310058594, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -292.47894287109375, + "logps/ref_rejected": -291.8714904785156, + "logps/rejected": -341.31640625, + "loss": 0.708, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.29715895652771, + "rewards/grad_term": 0.02386583387851715, + "rewards/margins": 3.6473331451416016, + "rewards/rejected": -4.944491863250732, + "step": 27 + }, + { + "epoch": 0.05808375469985738, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 43.751484729145034, + "learning_rate": 7.283967284126295e-07, + "logits/chosen": 1.5686805248260498, + "logits/rejected": 1.6045427322387695, + "logps/accuracies": 0.5625, + "logps/chosen": -274.0627136230469, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -275.16888427734375, + "logps/ref_rejected": -261.067138671875, + "logps/rejected": -293.029052734375, + "loss": 0.7257, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11061999201774597, + "rewards/grad_term": 0.01599438488483429, + "rewards/margins": 3.306811571121216, + "rewards/rejected": -3.1961915493011475, + "step": 28 + }, + { + "epoch": 0.06015817451056658, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0625, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 31.4830012847814, + "learning_rate": 7.360674468418735e-07, + "logits/chosen": 1.3757277727127075, + "logits/rejected": 1.405045747756958, + "logps/accuracies": 0.6875, + "logps/chosen": -325.8724365234375, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -329.0152893066406, + "logps/ref_rejected": -316.9151611328125, + "logps/rejected": -348.4771423339844, + "loss": 0.7116, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3142804205417633, + "rewards/grad_term": 0.01970498077571392, + "rewards/margins": 3.47047758102417, + "rewards/rejected": -3.1561975479125977, + "step": 29 + }, + { + "epoch": 0.062232594321275765, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 43.903525037448524, + "learning_rate": 7.434780902322396e-07, + "logits/chosen": 1.2318979501724243, + "logits/rejected": 1.2543270587921143, + "logps/accuracies": 0.9375, + "logps/chosen": -299.3183288574219, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -293.3425598144531, + "logps/ref_rejected": -326.46270751953125, + "logps/rejected": -357.6190185546875, + "loss": 0.714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5975769758224487, + "rewards/grad_term": 0.024707140401005745, + "rewards/margins": 2.5180513858795166, + "rewards/rejected": -3.115628242492676, + "step": 30 + }, + { + "epoch": 0.06430701413198496, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 34.75148622057771, + "learning_rate": 7.506457174281587e-07, + "logits/chosen": 1.2120341062545776, + "logits/rejected": 1.220780849456787, + "logps/accuracies": 0.8125, + "logps/chosen": -327.4204406738281, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -317.1983337402344, + "logps/ref_rejected": -326.3907165527344, + "logps/rejected": -362.292724609375, + "loss": 0.7512, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0222113132476807, + "rewards/grad_term": 0.028988810256123543, + "rewards/margins": 2.5679914951324463, + "rewards/rejected": -3.590202808380127, + "step": 31 + }, + { + "epoch": 0.06638143394269415, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 24.42268750796852, + "learning_rate": 7.575857620481944e-07, + "logits/chosen": 1.2759184837341309, + "logits/rejected": 1.330209732055664, + "logps/accuracies": 0.75, + "logps/chosen": -352.6253967285156, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -354.62640380859375, + "logps/ref_rejected": -366.90618896484375, + "logps/rejected": -401.5226135253906, + "loss": 0.6974, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.20009994506835938, + "rewards/grad_term": 0.021008620038628578, + "rewards/margins": 3.6617395877838135, + "rewards/rejected": -3.461639404296875, + "step": 32 + }, + { + "epoch": 0.06845585375340335, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 24.880719455281493, + "learning_rate": 7.643122325971209e-07, + "logits/chosen": 1.103371262550354, + "logits/rejected": 1.1177877187728882, + "logps/accuracies": 0.875, + "logps/chosen": -307.4759826660156, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -311.5048828125, + "logps/ref_rejected": -314.81597900390625, + "logps/rejected": -364.4283447265625, + "loss": 0.7078, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4028913974761963, + "rewards/grad_term": 0.007870044559240341, + "rewards/margins": 5.364123821258545, + "rewards/rejected": -4.961232662200928, + "step": 33 + }, + { + "epoch": 0.07053027356411254, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 19.643933011479398, + "learning_rate": 7.708378826961021e-07, + "logits/chosen": 1.102717399597168, + "logits/rejected": 1.2488610744476318, + "logps/accuracies": 0.75, + "logps/chosen": -334.2718200683594, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -332.568603515625, + "logps/ref_rejected": -440.8373718261719, + "logps/rejected": -474.8569030761719, + "loss": 0.6622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.170322984457016, + "rewards/grad_term": 0.02305561862885952, + "rewards/margins": 3.231626510620117, + "rewards/rejected": -3.401949882507324, + "step": 34 + }, + { + "epoch": 0.07260469337482173, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 22.64669553529777, + "learning_rate": 7.771743566306228e-07, + "logits/chosen": 1.0699303150177002, + "logits/rejected": 1.0104891061782837, + "logps/accuracies": 0.5625, + "logps/chosen": -356.24755859375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -349.8689880371094, + "logps/ref_rejected": -338.50445556640625, + "logps/rejected": -369.7318115234375, + "loss": 0.7305, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6378520131111145, + "rewards/grad_term": 0.030929066240787506, + "rewards/margins": 2.4848828315734863, + "rewards/rejected": -3.1227352619171143, + "step": 35 + }, + { + "epoch": 0.07467911318553092, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 27.940317128258794, + "learning_rate": 7.833323143899373e-07, + "logits/chosen": 0.8235185146331787, + "logits/rejected": 0.8534129858016968, + "logps/accuracies": 0.75, + "logps/chosen": -311.9441833496094, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -311.42791748046875, + "logps/ref_rejected": -301.2667541503906, + "logps/rejected": -360.82537841796875, + "loss": 0.6602, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05162731558084488, + "rewards/grad_term": 0.01031492929905653, + "rewards/margins": 5.904232025146484, + "rewards/rejected": -5.955859661102295, + "step": 36 + }, + { + "epoch": 0.07675353299624012, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 59.21163950114599, + "learning_rate": 7.893215395709077e-07, + "logits/chosen": 0.6561946868896484, + "logits/rejected": 0.7136399745941162, + "logps/accuracies": 0.875, + "logps/chosen": -280.95098876953125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -277.3229064941406, + "logps/ref_rejected": -273.8558349609375, + "logps/rejected": -336.818115234375, + "loss": 0.6433, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.36280834674835205, + "rewards/grad_term": 0.008878666907548904, + "rewards/margins": 5.933422088623047, + "rewards/rejected": -6.296230316162109, + "step": 37 + }, + { + "epoch": 0.0788279528069493, + "flips/correct->correct": 0.125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 49.21026179278685, + "learning_rate": 7.951510328891689e-07, + "logits/chosen": 1.0223195552825928, + "logits/rejected": 0.9707791209220886, + "logps/accuracies": 0.5, + "logps/chosen": -251.95631408691406, + "logps/ref_accuracies": 0.125, + "logps/ref_chosen": -241.30072021484375, + "logps/ref_rejected": -225.43099975585938, + "logps/rejected": -282.074951171875, + "loss": 0.6902, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0655571222305298, + "rewards/grad_term": 0.019987476989626884, + "rewards/margins": 4.59883975982666, + "rewards/rejected": -5.664397239685059, + "step": 38 + }, + { + "epoch": 0.0809023726176585, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 27.79357547798861, + "learning_rate": 8.008290935415948e-07, + "logits/chosen": 0.7353692650794983, + "logits/rejected": 0.8016875386238098, + "logps/accuracies": 0.8125, + "logps/chosen": -287.4935607910156, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -283.6936950683594, + "logps/ref_rejected": -283.30487060546875, + "logps/rejected": -337.9640197753906, + "loss": 0.6843, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3799862861633301, + "rewards/grad_term": 0.01074125524610281, + "rewards/margins": 5.085926055908203, + "rewards/rejected": -5.465912818908691, + "step": 39 + }, + { + "epoch": 0.0829767924283677, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 52.06939047981281, + "learning_rate": 8.063633902661875e-07, + "logits/chosen": 0.9137625694274902, + "logits/rejected": 0.9001289010047913, + "logps/accuracies": 0.8125, + "logps/chosen": -308.0149841308594, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -298.7401428222656, + "logps/ref_rejected": -289.8734436035156, + "logps/rejected": -362.60211181640625, + "loss": 0.6861, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9274865984916687, + "rewards/grad_term": 0.015340002253651619, + "rewards/margins": 6.3453826904296875, + "rewards/rejected": -7.272869110107422, + "step": 40 + }, + { + "epoch": 0.08505121223907688, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 30.550457494843638, + "learning_rate": 8.117610236262845e-07, + "logits/chosen": 0.7508188486099243, + "logits/rejected": 0.8092616200447083, + "logps/accuracies": 0.8125, + "logps/chosen": -344.6716613769531, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -332.699462890625, + "logps/ref_rejected": -344.08209228515625, + "logps/rejected": -386.7120056152344, + "loss": 0.7017, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1972177028656006, + "rewards/grad_term": 0.02788725309073925, + "rewards/margins": 3.065772771835327, + "rewards/rejected": -4.262990474700928, + "step": 41 + }, + { + "epoch": 0.08712563204978607, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 25.64415652049263, + "learning_rate": 8.170285807883206e-07, + "logits/chosen": 0.6477910876274109, + "logits/rejected": 0.8107466697692871, + "logps/accuracies": 0.8125, + "logps/chosen": -261.5460205078125, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -256.37158203125, + "logps/ref_rejected": -280.9100646972656, + "logps/rejected": -316.6261901855469, + "loss": 0.6765, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5174452066421509, + "rewards/grad_term": 0.02346464805305004, + "rewards/margins": 3.054164409637451, + "rewards/rejected": -3.5716099739074707, + "step": 42 + }, + { + "epoch": 0.08920005186049526, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 35.35935314371997, + "learning_rate": 8.221721838532495e-07, + "logits/chosen": 0.6840221285820007, + "logits/rejected": 0.6609375476837158, + "logps/accuracies": 0.6875, + "logps/chosen": -291.4024353027344, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -296.0429992675781, + "logps/ref_rejected": -282.0372009277344, + "logps/rejected": -319.9362487792969, + "loss": 0.5996, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4640587866306305, + "rewards/grad_term": 0.011877249926328659, + "rewards/margins": 4.253963470458984, + "rewards/rejected": -3.7899045944213867, + "step": 43 + }, + { + "epoch": 0.09127447167120446, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 28.87938199876403, + "learning_rate": 8.271975326310688e-07, + "logits/chosen": 0.8031829595565796, + "logits/rejected": 0.7779420614242554, + "logps/accuracies": 0.5625, + "logps/chosen": -308.4298095703125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -299.1800537109375, + "logps/ref_rejected": -301.2965087890625, + "logps/rejected": -333.80767822265625, + "loss": 0.65, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9249745607376099, + "rewards/grad_term": 0.028043199330568314, + "rewards/margins": 2.326139450073242, + "rewards/rejected": -3.2511138916015625, + "step": 44 + }, + { + "epoch": 0.09334889148191365, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 27.45541909883828, + "learning_rate": 8.321099426079305e-07, + "logits/chosen": 0.6578277349472046, + "logits/rejected": 0.7826619148254395, + "logps/accuracies": 0.875, + "logps/chosen": -284.1718444824219, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -278.0632019042969, + "logps/ref_rejected": -310.51953125, + "logps/rejected": -362.188720703125, + "loss": 0.6724, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6108630895614624, + "rewards/grad_term": 0.015516946092247963, + "rewards/margins": 4.556060791015625, + "rewards/rejected": -5.1669230461120605, + "step": 45 + }, + { + "epoch": 0.09542331129262284, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 59.28118284951873, + "learning_rate": 8.369143787399735e-07, + "logits/chosen": 0.9397487044334412, + "logits/rejected": 0.9460306167602539, + "logps/accuracies": 0.75, + "logps/chosen": -250.52760314941406, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -249.87916564941406, + "logps/ref_rejected": -253.21328735351562, + "logps/rejected": -280.88421630859375, + "loss": 0.7086, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0648447573184967, + "rewards/grad_term": 0.024975696578621864, + "rewards/margins": 2.702247142791748, + "rewards/rejected": -2.7670915126800537, + "step": 46 + }, + { + "epoch": 0.09749773110333204, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 27.055341319651774, + "learning_rate": 8.416154856125216e-07, + "logits/chosen": 0.8418172597885132, + "logits/rejected": 0.8614631295204163, + "logps/accuracies": 0.6875, + "logps/chosen": -289.743408203125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -288.4799499511719, + "logps/ref_rejected": -295.8186340332031, + "logps/rejected": -333.06170654296875, + "loss": 0.648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12634362280368805, + "rewards/grad_term": 0.015468433499336243, + "rewards/margins": 3.597963571548462, + "rewards/rejected": -3.724307060241699, + "step": 47 + }, + { + "epoch": 0.09957215091404123, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 29.320128400390026, + "learning_rate": 8.462176144238853e-07, + "logits/chosen": 1.0445611476898193, + "logits/rejected": 1.080256700515747, + "logps/accuracies": 0.875, + "logps/chosen": -277.6840515136719, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -284.16644287109375, + "logps/ref_rejected": -313.25799560546875, + "logps/rejected": -363.92913818359375, + "loss": 0.6148, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.648241400718689, + "rewards/grad_term": 0.01066309679299593, + "rewards/margins": 5.71535062789917, + "rewards/rejected": -5.067109107971191, + "step": 48 + }, + { + "epoch": 0.10164657072475042, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 30.36633688848658, + "learning_rate": 8.507248471867036e-07, + "logits/chosen": 1.0277738571166992, + "logits/rejected": 0.9966739416122437, + "logps/accuracies": 0.625, + "logps/chosen": -354.4734191894531, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -353.09027099609375, + "logps/ref_rejected": -352.4345397949219, + "logps/rejected": -385.0770263671875, + "loss": 0.6356, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13831399381160736, + "rewards/grad_term": 0.021235931664705276, + "rewards/margins": 3.1259407997131348, + "rewards/rejected": -3.264254570007324, + "step": 49 + }, + { + "epoch": 0.10372099053545962, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 34.234438693372084, + "learning_rate": 8.551410184841808e-07, + "logits/chosen": 0.8633083701133728, + "logits/rejected": 0.860028088092804, + "logps/accuracies": 0.6875, + "logps/chosen": -252.79647827148438, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -252.6006317138672, + "logps/ref_rejected": -258.76251220703125, + "logps/rejected": -295.47882080078125, + "loss": 0.65, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01958458498120308, + "rewards/grad_term": 0.01410377025604248, + "rewards/margins": 3.6520471572875977, + "rewards/rejected": -3.6716315746307373, + "step": 50 + }, + { + "epoch": 0.10579541034616881, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 29.660260139951582, + "learning_rate": 8.59469735071793e-07, + "logits/chosen": 0.38166430592536926, + "logits/rejected": 0.4328911304473877, + "logps/accuracies": 0.625, + "logps/chosen": -296.5737609863281, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -290.32611083984375, + "logps/ref_rejected": -293.0495910644531, + "logps/rejected": -349.3868408203125, + "loss": 0.6285, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6247656941413879, + "rewards/grad_term": 0.017583010718226433, + "rewards/margins": 5.008961200714111, + "rewards/rejected": -5.633727073669434, + "step": 51 + }, + { + "epoch": 0.107869830156878, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 25.949283947333324, + "learning_rate": 8.637143935755428e-07, + "logits/chosen": 0.727641224861145, + "logits/rejected": 0.7505197525024414, + "logps/accuracies": 0.625, + "logps/chosen": -288.1712646484375, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -278.9561767578125, + "logps/ref_rejected": -265.6705322265625, + "logps/rejected": -308.8449401855469, + "loss": 0.6146, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9215071201324463, + "rewards/grad_term": 0.022838197648525238, + "rewards/margins": 3.395932912826538, + "rewards/rejected": -4.317440032958984, + "step": 52 + }, + { + "epoch": 0.10994424996758718, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 24.134309233597588, + "learning_rate": 8.678781965043402e-07, + "logits/chosen": 0.7036612033843994, + "logits/rejected": 0.6557080745697021, + "logps/accuracies": 0.875, + "logps/chosen": -358.0240478515625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -348.71356201171875, + "logps/ref_rejected": -355.6617431640625, + "logps/rejected": -404.3010559082031, + "loss": 0.6916, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9310531616210938, + "rewards/grad_term": 0.02438879944384098, + "rewards/margins": 3.93287992477417, + "rewards/rejected": -4.863933086395264, + "step": 53 + }, + { + "epoch": 0.11201866977829639, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 53.64626463678004, + "learning_rate": 8.719641667656282e-07, + "logits/chosen": 0.6714786887168884, + "logits/rejected": 0.5823845863342285, + "logps/accuracies": 0.625, + "logps/chosen": -376.20220947265625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -369.01507568359375, + "logps/ref_rejected": -328.4320373535156, + "logps/rejected": -383.90655517578125, + "loss": 0.6963, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7187104821205139, + "rewards/grad_term": 0.011453664861619473, + "rewards/margins": 4.828742980957031, + "rewards/rejected": -5.5474534034729, + "step": 54 + }, + { + "epoch": 0.11409308958900558, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 70.929778345069, + "learning_rate": 8.759751608490621e-07, + "logits/chosen": 0.44098129868507385, + "logits/rejected": 0.5110803842544556, + "logps/accuracies": 0.8125, + "logps/chosen": -307.63323974609375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -303.2783203125, + "logps/ref_rejected": -305.9503173828125, + "logps/rejected": -365.0198669433594, + "loss": 0.6622, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4354937672615051, + "rewards/grad_term": 0.016174456104636192, + "rewards/margins": 5.471461296081543, + "rewards/rejected": -5.906955242156982, + "step": 55 + }, + { + "epoch": 0.11616750939971476, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 26.619296281410804, + "learning_rate": 8.799138808222686e-07, + "logits/chosen": 0.7330751419067383, + "logits/rejected": 0.9024415016174316, + "logps/accuracies": 0.75, + "logps/chosen": -235.89239501953125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -231.52980041503906, + "logps/ref_rejected": -262.5042724609375, + "logps/rejected": -304.2793273925781, + "loss": 0.6587, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43625733256340027, + "rewards/grad_term": 0.020225245505571365, + "rewards/margins": 3.7412445545196533, + "rewards/rejected": -4.177502155303955, + "step": 56 + }, + { + "epoch": 0.11824192921042397, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 27.602722271896948, + "learning_rate": 8.837828852648599e-07, + "logits/chosen": 0.5326017737388611, + "logits/rejected": 0.6437039971351624, + "logps/accuracies": 0.8125, + "logps/chosen": -301.2654113769531, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -306.17083740234375, + "logps/ref_rejected": -299.8456115722656, + "logps/rejected": -362.2225036621094, + "loss": 0.6253, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4905431568622589, + "rewards/grad_term": 0.014911260455846786, + "rewards/margins": 6.728227615356445, + "rewards/rejected": -6.2376837730407715, + "step": 57 + }, + { + "epoch": 0.12031634902113315, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 28.670654744865576, + "learning_rate": 8.875845992515123e-07, + "logits/chosen": 0.38607218861579895, + "logits/rejected": 0.414478600025177, + "logps/accuracies": 0.625, + "logps/chosen": -328.02618408203125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -322.28265380859375, + "logps/ref_rejected": -297.2394104003906, + "logps/rejected": -336.27459716796875, + "loss": 0.6757, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5743532180786133, + "rewards/grad_term": 0.023728108033537865, + "rewards/margins": 3.329164505004883, + "rewards/rejected": -3.903517961502075, + "step": 58 + }, + { + "epoch": 0.12239076883184234, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 31.46030833209825, + "learning_rate": 8.91321323481661e-07, + "logits/chosen": 0.6807994246482849, + "logits/rejected": 0.7104217410087585, + "logps/accuracies": 0.8125, + "logps/chosen": -331.4752502441406, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -335.2398986816406, + "logps/ref_rejected": -332.1490173339844, + "logps/rejected": -374.0831298828125, + "loss": 0.6771, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.37646305561065674, + "rewards/grad_term": 0.010923169553279877, + "rewards/margins": 4.56987190246582, + "rewards/rejected": -4.193408966064453, + "step": 59 + }, + { + "epoch": 0.12446518864255153, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 23.984091347684366, + "learning_rate": 8.949952426418784e-07, + "logits/chosen": 0.568733811378479, + "logits/rejected": 0.635265052318573, + "logps/accuracies": 0.6875, + "logps/chosen": -397.9205322265625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -402.82952880859375, + "logps/ref_rejected": -363.6296691894531, + "logps/rejected": -400.825439453125, + "loss": 0.6358, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4909006357192993, + "rewards/grad_term": 0.015072712674736977, + "rewards/margins": 4.210475921630859, + "rewards/rejected": -3.7195756435394287, + "step": 60 + }, + { + "epoch": 0.12653960845326073, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.125, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 73.64774037078915, + "learning_rate": 8.986084330770518e-07, + "logits/chosen": 0.7834938764572144, + "logits/rejected": 0.8703972101211548, + "logps/accuracies": 0.6875, + "logps/chosen": -256.0115661621094, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -239.9134521484375, + "logps/ref_rejected": -261.8016662597656, + "logps/rejected": -313.2479248046875, + "loss": 0.6474, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6098082065582275, + "rewards/grad_term": 0.01827179826796055, + "rewards/margins": 3.534818172454834, + "rewards/rejected": -5.144626140594482, + "step": 61 + }, + { + "epoch": 0.12861402826396992, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 25.70529599001306, + "learning_rate": 9.021628698377976e-07, + "logits/chosen": 0.5873112082481384, + "logits/rejected": 0.6506080627441406, + "logps/accuracies": 0.75, + "logps/chosen": -274.6400451660156, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -276.3179016113281, + "logps/ref_rejected": -279.714599609375, + "logps/rejected": -340.3433532714844, + "loss": 0.6359, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16778606176376343, + "rewards/grad_term": 0.014593811705708504, + "rewards/margins": 6.230656623840332, + "rewards/rejected": -6.062870979309082, + "step": 62 + }, + { + "epoch": 0.1306884480746791, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 29.57488992234054, + "learning_rate": 9.056604331640114e-07, + "logits/chosen": 0.511448323726654, + "logits/rejected": 0.4164316654205322, + "logps/accuracies": 0.75, + "logps/chosen": -254.2750244140625, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -253.3539276123047, + "logps/ref_rejected": -264.7762145996094, + "logps/rejected": -299.82305908203125, + "loss": 0.6508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09211038053035736, + "rewards/grad_term": 0.026033716276288033, + "rewards/margins": 3.4125752449035645, + "rewards/rejected": -3.504685401916504, + "step": 63 + }, + { + "epoch": 0.1327628678853883, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 55.6766766699201, + "learning_rate": 9.091029144578332e-07, + "logits/chosen": 0.5473611354827881, + "logits/rejected": 0.6334167122840881, + "logps/accuracies": 0.8125, + "logps/chosen": -307.349365234375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -308.5955505371094, + "logps/ref_rejected": -327.8055725097656, + "logps/rejected": -374.07574462890625, + "loss": 0.6354, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12462212890386581, + "rewards/grad_term": 0.011786059476435184, + "rewards/margins": 4.751638412475586, + "rewards/rejected": -4.627016067504883, + "step": 64 + }, + { + "epoch": 0.1327628678853883, + "eval_flips/correct->correct": 0.43842363357543945, + "eval_flips/correct->incorrect": 0.004926108289510012, + "eval_flips/incorrect->correct": 0.2660098373889923, + "eval_flips/incorrect->incorrect": 0.29064038395881653, + "eval_logits/chosen": 0.5654913783073425, + "eval_logits/rejected": 0.6160324215888977, + "eval_logps/accuracies": 0.7044335007667542, + "eval_logps/chosen": -288.4407958984375, + "eval_logps/ref_accuracies": 0.4433497488498688, + "eval_logps/ref_chosen": -287.3511047363281, + "eval_logps/ref_rejected": -289.0460205078125, + "eval_logps/rejected": -328.46038818359375, + "eval_loss": 0.6570103168487549, + "eval_rewards/accuracies": 0.8325123190879822, + "eval_rewards/chosen": -0.10896830260753632, + "eval_rewards/grad_term": 0.021043213084340096, + "eval_rewards/margins": 3.8324687480926514, + "eval_rewards/rejected": -3.9414374828338623, + "eval_runtime": 786.9931, + "eval_samples_per_second": 2.056, + "eval_steps_per_second": 0.258, + "step": 64 + }, + { + "epoch": 0.13483728769609749, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 24.735517627215668, + "learning_rate": 9.124920217935358e-07, + "logits/chosen": 0.40278834104537964, + "logits/rejected": 0.4163047969341278, + "logps/accuracies": 0.875, + "logps/chosen": -353.63824462890625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -357.9703369140625, + "logps/ref_rejected": -365.9423522949219, + "logps/rejected": -425.2349853515625, + "loss": 0.6043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4332119822502136, + "rewards/grad_term": 0.007745261769741774, + "rewards/margins": 6.362478256225586, + "rewards/rejected": -5.929266452789307, + "step": 65 + }, + { + "epoch": 0.1369117075068067, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 26.834165183727563, + "learning_rate": 9.158293850067597e-07, + "logits/chosen": 0.387469083070755, + "logits/rejected": 0.4058898091316223, + "logps/accuracies": 0.8125, + "logps/chosen": -252.04205322265625, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -252.20950317382812, + "logps/ref_rejected": -263.31280517578125, + "logps/rejected": -316.500244140625, + "loss": 0.6308, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.01674594357609749, + "rewards/grad_term": 0.014994516968727112, + "rewards/margins": 5.335488319396973, + "rewards/rejected": -5.318742275238037, + "step": 66 + }, + { + "epoch": 0.1389861273175159, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 31.0308823025867, + "learning_rate": 9.191165604010531e-07, + "logits/chosen": 0.3395693302154541, + "logits/rejected": 0.34473684430122375, + "logps/accuracies": 0.75, + "logps/chosen": -325.09197998046875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -328.00286865234375, + "logps/ref_rejected": -305.96258544921875, + "logps/rejected": -359.0819396972656, + "loss": 0.6403, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2910885214805603, + "rewards/grad_term": 0.009055268950760365, + "rewards/margins": 5.603026390075684, + "rewards/rejected": -5.3119378089904785, + "step": 67 + }, + { + "epoch": 0.14106054712822508, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 26.519671253661027, + "learning_rate": 9.22355035105741e-07, + "logits/chosen": 0.4188442528247833, + "logits/rejected": 0.4437766969203949, + "logps/accuracies": 0.6875, + "logps/chosen": -293.8087463378906, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -280.9673156738281, + "logps/ref_rejected": -302.37518310546875, + "logps/rejected": -354.1598815917969, + "loss": 0.619, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.284143090248108, + "rewards/grad_term": 0.02901587449014187, + "rewards/margins": 3.8943264484405518, + "rewards/rejected": -5.178469657897949, + "step": 68 + }, + { + "epoch": 0.14313496693893427, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 30.502044803594153, + "learning_rate": 9.255462311156644e-07, + "logits/chosen": 0.5335452556610107, + "logits/rejected": 0.5705280303955078, + "logps/accuracies": 0.5625, + "logps/chosen": -320.91192626953125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -304.9333190917969, + "logps/ref_rejected": -281.81768798828125, + "logps/rejected": -346.9627685546875, + "loss": 0.6755, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5978612899780273, + "rewards/grad_term": 0.018111273646354675, + "rewards/margins": 4.916650295257568, + "rewards/rejected": -6.514511585235596, + "step": 69 + }, + { + "epoch": 0.14520938674964345, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 39.33816705000978, + "learning_rate": 9.286915090402617e-07, + "logits/chosen": 0.4920622706413269, + "logits/rejected": 0.5008682012557983, + "logps/accuracies": 0.8125, + "logps/chosen": -302.3096618652344, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -291.40972900390625, + "logps/ref_rejected": -286.4915771484375, + "logps/rejected": -359.8939514160156, + "loss": 0.6369, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0899897813796997, + "rewards/grad_term": 0.0171764325350523, + "rewards/margins": 6.250240325927734, + "rewards/rejected": -7.340230464935303, + "step": 70 + }, + { + "epoch": 0.14728380656035264, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 32.47379922437437, + "learning_rate": 9.317921715867286e-07, + "logits/chosen": 0.5690668225288391, + "logits/rejected": 0.6497770547866821, + "logps/accuracies": 0.75, + "logps/chosen": -300.4138488769531, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -293.0126647949219, + "logps/ref_rejected": -293.5539855957031, + "logps/rejected": -361.0144348144531, + "loss": 0.6126, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7401193976402283, + "rewards/grad_term": 0.012420150451362133, + "rewards/margins": 6.005928039550781, + "rewards/rejected": -6.746047019958496, + "step": 71 + }, + { + "epoch": 0.14935822637106183, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 35.11526252181687, + "learning_rate": 9.348494667995762e-07, + "logits/chosen": 0.5223222970962524, + "logits/rejected": 0.6166201829910278, + "logps/accuracies": 0.875, + "logps/chosen": -262.4486083984375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -247.41668701171875, + "logps/ref_rejected": -251.63619995117188, + "logps/rejected": -323.4156494140625, + "loss": 0.6372, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.503194808959961, + "rewards/grad_term": 0.016301069408655167, + "rewards/margins": 5.674752712249756, + "rewards/rejected": -7.177947521209717, + "step": 72 + }, + { + "epoch": 0.15143264618177105, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 33.25143057906705, + "learning_rate": 9.378645910767493e-07, + "logits/chosen": 0.5215972065925598, + "logits/rejected": 0.4775215685367584, + "logps/accuracies": 0.6875, + "logps/chosen": -257.8221435546875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -257.3147888183594, + "logps/ref_rejected": -245.8674774169922, + "logps/rejected": -302.9472961425781, + "loss": 0.6342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.050737857818603516, + "rewards/grad_term": 0.01016687136143446, + "rewards/margins": 5.657248497009277, + "rewards/rejected": -5.707986831665039, + "step": 73 + }, + { + "epoch": 0.15350706599248023, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.6875, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 52.55742381973749, + "learning_rate": 9.408386919805467e-07, + "logits/chosen": 0.7360602021217346, + "logits/rejected": 0.70041424036026, + "logps/accuracies": 0.9375, + "logps/chosen": -317.7826843261719, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -302.98712158203125, + "logps/ref_rejected": -267.1181945800781, + "logps/rejected": -356.98870849609375, + "loss": 0.6432, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4795535802841187, + "rewards/grad_term": 0.008950343355536461, + "rewards/margins": 7.507498741149902, + "rewards/rejected": -8.987051963806152, + "step": 74 + }, + { + "epoch": 0.15558148580318942, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 26.35632884945645, + "learning_rate": 9.437728708598716e-07, + "logits/chosen": 0.3639271855354309, + "logits/rejected": 0.38472047448158264, + "logps/accuracies": 0.875, + "logps/chosen": -278.147216796875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -264.1582336425781, + "logps/ref_rejected": -274.158203125, + "logps/rejected": -352.51422119140625, + "loss": 0.6529, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3989008665084839, + "rewards/grad_term": 0.011926427483558655, + "rewards/margins": 6.436697959899902, + "rewards/rejected": -7.835598945617676, + "step": 75 + }, + { + "epoch": 0.1576559056138986, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 43.67163684751143, + "learning_rate": 9.466681852988078e-07, + "logits/chosen": 0.6780661344528198, + "logits/rejected": 0.7738847732543945, + "logps/accuracies": 0.75, + "logps/chosen": -286.3451843261719, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -271.2528991699219, + "logps/ref_rejected": -271.053955078125, + "logps/rejected": -328.85772705078125, + "loss": 0.6067, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5092324018478394, + "rewards/grad_term": 0.023612529039382935, + "rewards/margins": 4.271145820617676, + "rewards/rejected": -5.780378341674805, + "step": 76 + }, + { + "epoch": 0.1597303254246078, + "flips/correct->correct": 0.875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 22.36636463977758, + "learning_rate": 9.495256514051431e-07, + "logits/chosen": 0.4788045287132263, + "logits/rejected": 0.549846887588501, + "logps/accuracies": 1.0, + "logps/chosen": -222.5209197998047, + "logps/ref_accuracies": 0.875, + "logps/ref_chosen": -207.42868041992188, + "logps/ref_rejected": -236.1974334716797, + "logps/rejected": -293.5431823730469, + "loss": 0.6448, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5092250108718872, + "rewards/grad_term": 0.0173178743571043, + "rewards/margins": 4.225347995758057, + "rewards/rejected": -5.734574317932129, + "step": 77 + }, + { + "epoch": 0.161804745235317, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 25.267712307372168, + "learning_rate": 9.523462459512337e-07, + "logits/chosen": 0.5372971892356873, + "logits/rejected": 0.6544579863548279, + "logps/accuracies": 0.9375, + "logps/chosen": -278.4732360839844, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -275.07958984375, + "logps/ref_rejected": -292.14898681640625, + "logps/rejected": -352.7454833984375, + "loss": 0.6166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3393683433532715, + "rewards/grad_term": 0.022649819031357765, + "rewards/margins": 5.720274925231934, + "rewards/rejected": -6.059643745422363, + "step": 78 + }, + { + "epoch": 0.16387916504602618, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 57.52603839415039, + "learning_rate": 9.551309083784976e-07, + "logits/chosen": 0.6397267580032349, + "logits/rejected": 0.7187516093254089, + "logps/accuracies": 0.9375, + "logps/chosen": -273.272705078125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -278.8054504394531, + "logps/ref_rejected": -292.9872741699219, + "logps/rejected": -340.0445861816406, + "loss": 0.6701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5532730221748352, + "rewards/grad_term": 0.014312355779111385, + "rewards/margins": 5.259001731872559, + "rewards/rejected": -4.705729007720947, + "step": 79 + }, + { + "epoch": 0.1659535848567354, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 50.777119418812084, + "learning_rate": 9.578805426758263e-07, + "logits/chosen": 0.4606146216392517, + "logits/rejected": 0.46222275495529175, + "logps/accuracies": 0.8125, + "logps/chosen": -292.800537109375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -291.5415954589844, + "logps/ref_rejected": -313.3748474121094, + "logps/rejected": -364.9443054199219, + "loss": 0.6283, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1258973628282547, + "rewards/grad_term": 0.018451694399118423, + "rewards/margins": 5.031045436859131, + "rewards/rejected": -5.156942367553711, + "step": 80 + }, + { + "epoch": 0.16802800466744458, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 25.904014404983347, + "learning_rate": 9.605960191413192e-07, + "logits/chosen": 0.5609871745109558, + "logits/rejected": 0.646887481212616, + "logps/accuracies": 0.6875, + "logps/chosen": -388.10205078125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -396.9491271972656, + "logps/ref_rejected": -395.2713928222656, + "logps/rejected": -423.269287109375, + "loss": 0.5963, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8847097158432007, + "rewards/grad_term": 0.024479346349835396, + "rewards/margins": 3.6844961643218994, + "rewards/rejected": -2.7997865676879883, + "step": 81 + }, + { + "epoch": 0.17010242447815377, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 18.071356978636363, + "learning_rate": 9.632781760359235e-07, + "logits/chosen": 0.2946923077106476, + "logits/rejected": 0.26006707549095154, + "logps/accuracies": 0.6875, + "logps/chosen": -222.20687866210938, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -218.695068359375, + "logps/ref_rejected": -223.76553344726562, + "logps/rejected": -264.6587829589844, + "loss": 0.6335, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3511809706687927, + "rewards/grad_term": 0.025286730378866196, + "rewards/margins": 3.7381458282470703, + "rewards/rejected": -4.08932638168335, + "step": 82 + }, + { + "epoch": 0.17217684428886296, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 29.580870064695095, + "learning_rate": 9.659278211368498e-07, + "logits/chosen": 0.653415322303772, + "logits/rejected": 0.7497892379760742, + "logps/accuracies": 0.875, + "logps/chosen": -334.653564453125, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -324.0084533691406, + "logps/ref_rejected": -340.58624267578125, + "logps/rejected": -422.7427978515625, + "loss": 0.6484, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0645086765289307, + "rewards/grad_term": 0.018634023144841194, + "rewards/margins": 7.15114688873291, + "rewards/rejected": -8.215656280517578, + "step": 83 + }, + { + "epoch": 0.17425126409957215, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 37.56856528542604, + "learning_rate": 9.685457331979593e-07, + "logits/chosen": 0.7688320875167847, + "logits/rejected": 0.913873553276062, + "logps/accuracies": 0.8125, + "logps/chosen": -252.0431671142578, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -241.2303466796875, + "logps/ref_rejected": -278.7004699707031, + "logps/rejected": -341.7118835449219, + "loss": 0.6808, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0812804698944092, + "rewards/grad_term": 0.022220587357878685, + "rewards/margins": 5.219861030578613, + "rewards/rejected": -6.301141738891602, + "step": 84 + }, + { + "epoch": 0.17632568391028133, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 25.08015845081592, + "learning_rate": 9.711326633237342e-07, + "logits/chosen": 0.6746060252189636, + "logits/rejected": 0.6128141283988953, + "logps/accuracies": 0.8125, + "logps/chosen": -324.61865234375, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -314.5938415527344, + "logps/ref_rejected": -327.64666748046875, + "logps/rejected": -388.7850036621094, + "loss": 0.58, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.002484679222107, + "rewards/grad_term": 0.017657626420259476, + "rewards/margins": 5.111349582672119, + "rewards/rejected": -6.113834857940674, + "step": 85 + }, + { + "epoch": 0.17840010372099052, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 28.420792350466737, + "learning_rate": 9.736893362628883e-07, + "logits/chosen": 0.49216994643211365, + "logits/rejected": 0.5920721888542175, + "logps/accuracies": 0.9375, + "logps/chosen": -299.5179443359375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -294.0077209472656, + "logps/ref_rejected": -302.6850280761719, + "logps/rejected": -385.48388671875, + "loss": 0.6414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5510200262069702, + "rewards/grad_term": 0.004014983773231506, + "rewards/margins": 7.728863716125488, + "rewards/rejected": -8.279884338378906, + "step": 86 + }, + { + "epoch": 0.18047452353169974, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 32.17317573293341, + "learning_rate": 9.762164516272033e-07, + "logits/chosen": 0.7234176397323608, + "logits/rejected": 0.7146831154823303, + "logps/accuracies": 0.9375, + "logps/chosen": -299.3135681152344, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -294.986083984375, + "logps/ref_rejected": -306.23895263671875, + "logps/rejected": -362.81964111328125, + "loss": 0.6571, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.43274781107902527, + "rewards/grad_term": 0.012466475367546082, + "rewards/margins": 5.2253193855285645, + "rewards/rejected": -5.658066749572754, + "step": 87 + }, + { + "epoch": 0.18254894334240893, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 26.897654394769035, + "learning_rate": 9.787146850407078e-07, + "logits/chosen": 0.47364750504493713, + "logits/rejected": 0.5636922717094421, + "logps/accuracies": 0.75, + "logps/chosen": -264.1487121582031, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -270.39495849609375, + "logps/ref_rejected": -258.71246337890625, + "logps/rejected": -319.8270263671875, + "loss": 0.6117, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6246242523193359, + "rewards/grad_term": 0.01352761872112751, + "rewards/margins": 6.736079216003418, + "rewards/rejected": -6.111454963684082, + "step": 88 + }, + { + "epoch": 0.18462336315311811, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 36.70610652642651, + "learning_rate": 9.811846892239293e-07, + "logits/chosen": 0.1739477515220642, + "logits/rejected": 0.20079316198825836, + "logps/accuracies": 0.875, + "logps/chosen": -334.56201171875, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -319.6775207519531, + "logps/ref_rejected": -328.6192626953125, + "logps/rejected": -389.33929443359375, + "loss": 0.5805, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4884480237960815, + "rewards/grad_term": 0.028538305312395096, + "rewards/margins": 4.583554267883301, + "rewards/rejected": -6.072002410888672, + "step": 89 + }, + { + "epoch": 0.1866977829638273, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 38.62477740343346, + "learning_rate": 9.836270950175693e-07, + "logits/chosen": 0.5048727989196777, + "logits/rejected": 0.5224493741989136, + "logps/accuracies": 0.875, + "logps/chosen": -265.7325439453125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -250.5098419189453, + "logps/ref_rejected": -255.43650817871094, + "logps/rejected": -315.4735107421875, + "loss": 0.6476, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5222673416137695, + "rewards/grad_term": 0.023190699517726898, + "rewards/margins": 4.481435298919678, + "rewards/rejected": -6.003702640533447, + "step": 90 + }, + { + "epoch": 0.1887722027745365, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 36.7027529146051, + "learning_rate": 9.860425123496167e-07, + "logits/chosen": 0.5219244360923767, + "logits/rejected": 0.5849474668502808, + "logps/accuracies": 0.9375, + "logps/chosen": -240.11685180664062, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -234.85340881347656, + "logps/ref_rejected": -262.1112060546875, + "logps/rejected": -327.15716552734375, + "loss": 0.6069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5263462662696838, + "rewards/grad_term": 0.004026439506560564, + "rewards/margins": 5.978251934051514, + "rewards/rejected": -6.504598140716553, + "step": 91 + }, + { + "epoch": 0.19084662258524568, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 34.422228673025685, + "learning_rate": 9.884315311496123e-07, + "logits/chosen": 0.5342029929161072, + "logits/rejected": 0.5386108160018921, + "logps/accuracies": 0.75, + "logps/chosen": -340.9605407714844, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -338.02606201171875, + "logps/ref_rejected": -346.3376770019531, + "logps/rejected": -377.7868347167969, + "loss": 0.5815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29344886541366577, + "rewards/grad_term": 0.020996563136577606, + "rewards/margins": 2.8514671325683594, + "rewards/rejected": -3.1449155807495117, + "step": 92 + }, + { + "epoch": 0.1929210423959549, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 25.326369311886236, + "learning_rate": 9.907947222134885e-07, + "logits/chosen": 0.4443345069885254, + "logits/rejected": 0.4642353653907776, + "logps/accuracies": 0.875, + "logps/chosen": -346.2325744628906, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -345.8390197753906, + "logps/ref_rejected": -357.72564697265625, + "logps/rejected": -413.6025085449219, + "loss": 0.5793, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03935527801513672, + "rewards/grad_term": 0.00771428644657135, + "rewards/margins": 5.548335552215576, + "rewards/rejected": -5.587691783905029, + "step": 93 + }, + { + "epoch": 0.19499546220666408, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 21.376114840937195, + "learning_rate": 9.931326380221604e-07, + "logits/chosen": 0.6561794281005859, + "logits/rejected": 0.7463537454605103, + "logps/accuracies": 0.8125, + "logps/chosen": -254.1697540283203, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -258.75799560546875, + "logps/ref_rejected": -282.442138671875, + "logps/rejected": -320.4227600097656, + "loss": 0.5967, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4588264226913452, + "rewards/grad_term": 0.02241508476436138, + "rewards/margins": 4.256890296936035, + "rewards/rejected": -3.7980637550354004, + "step": 94 + }, + { + "epoch": 0.19706988201737327, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 28.87335565570023, + "learning_rate": 9.95445813516801e-07, + "logits/chosen": 0.31641554832458496, + "logits/rejected": 0.4115113914012909, + "logps/accuracies": 0.8125, + "logps/chosen": -305.4784240722656, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -301.1734619140625, + "logps/ref_rejected": -309.7505187988281, + "logps/rejected": -370.07855224609375, + "loss": 0.595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43049633502960205, + "rewards/grad_term": 0.007062141317874193, + "rewards/margins": 5.602307319641113, + "rewards/rejected": -6.032803535461426, + "step": 95 + }, + { + "epoch": 0.19914430182808246, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 30.52259082669545, + "learning_rate": 9.977347668335242e-07, + "logits/chosen": 0.5447170734405518, + "logits/rejected": 0.6960605978965759, + "logps/accuracies": 0.8125, + "logps/chosen": -320.6680603027344, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -323.8368225097656, + "logps/ref_rejected": -339.37957763671875, + "logps/rejected": -400.385009765625, + "loss": 0.6261, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3168814182281494, + "rewards/grad_term": 0.006425461731851101, + "rewards/margins": 6.417423248291016, + "rewards/rejected": -6.100542068481445, + "step": 96 + }, + { + "epoch": 0.20121872163879165, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 51.93086544682398, + "learning_rate": 1e-06, + "logits/chosen": 0.6568098068237305, + "logits/rejected": 0.6733189225196838, + "logps/accuracies": 0.875, + "logps/chosen": -286.9490966796875, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -284.9871826171875, + "logps/ref_rejected": -301.6272888183594, + "logps/rejected": -357.9549560546875, + "loss": 0.5764, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19619154930114746, + "rewards/grad_term": 0.018322059884667397, + "rewards/margins": 5.436576843261719, + "rewards/rejected": -5.632768630981445, + "step": 97 + }, + { + "epoch": 0.20329314144950084, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 28.258232668290642, + "learning_rate": 1e-06, + "logits/chosen": 0.41130974888801575, + "logits/rejected": 0.47705498337745667, + "logps/accuracies": 0.75, + "logps/chosen": -322.64227294921875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -310.24725341796875, + "logps/ref_rejected": -321.1720275878906, + "logps/rejected": -375.3896484375, + "loss": 0.6226, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.239502191543579, + "rewards/grad_term": 0.022803550586104393, + "rewards/margins": 4.182260990142822, + "rewards/rejected": -5.4217634201049805, + "step": 98 + }, + { + "epoch": 0.20536756126021002, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 27.738502482797642, + "learning_rate": 9.988465974625143e-07, + "logits/chosen": 0.4429183602333069, + "logits/rejected": 0.5393229126930237, + "logps/accuracies": 0.75, + "logps/chosen": -272.3836975097656, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -278.6350402832031, + "logps/ref_rejected": -277.7386169433594, + "logps/rejected": -315.6932373046875, + "loss": 0.6369, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6251335740089417, + "rewards/grad_term": 0.01758977398276329, + "rewards/margins": 4.420593738555908, + "rewards/rejected": -3.7954602241516113, + "step": 99 + }, + { + "epoch": 0.20744198107091924, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 49.56019663548194, + "learning_rate": 9.976931949250289e-07, + "logits/chosen": 0.5111449956893921, + "logits/rejected": 0.4637998640537262, + "logps/accuracies": 0.8125, + "logps/chosen": -305.05950927734375, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -303.1130065917969, + "logps/ref_rejected": -295.25433349609375, + "logps/rejected": -367.4417724609375, + "loss": 0.635, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19465157389640808, + "rewards/grad_term": 0.006312578916549683, + "rewards/margins": 7.024093151092529, + "rewards/rejected": -7.218744277954102, + "step": 100 + }, + { + "epoch": 0.20951640088162843, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 17.980236189059163, + "learning_rate": 9.965397923875432e-07, + "logits/chosen": 0.5828474760055542, + "logits/rejected": 0.6235547661781311, + "logps/accuracies": 0.8125, + "logps/chosen": -270.784912109375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -273.26043701171875, + "logps/ref_rejected": -269.11077880859375, + "logps/rejected": -325.9720458984375, + "loss": 0.6338, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.24755248427391052, + "rewards/grad_term": 0.01740310713648796, + "rewards/margins": 5.933681488037109, + "rewards/rejected": -5.686128616333008, + "step": 101 + }, + { + "epoch": 0.21159082069233762, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 51.922995548635456, + "learning_rate": 9.953863898500576e-07, + "logits/chosen": 0.18250882625579834, + "logits/rejected": 0.20775896310806274, + "logps/accuracies": 0.8125, + "logps/chosen": -266.9601745605469, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -265.5532531738281, + "logps/ref_rejected": -263.236328125, + "logps/rejected": -325.6028747558594, + "loss": 0.5857, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.14069411158561707, + "rewards/grad_term": 0.010405524633824825, + "rewards/margins": 6.095961570739746, + "rewards/rejected": -6.236655235290527, + "step": 102 + }, + { + "epoch": 0.2136652405030468, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 58.49656496183437, + "learning_rate": 9.94232987312572e-07, + "logits/chosen": 0.24150438606739044, + "logits/rejected": 0.23409827053546906, + "logps/accuracies": 0.6875, + "logps/chosen": -275.4272155761719, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -267.8497009277344, + "logps/ref_rejected": -259.2445068359375, + "logps/rejected": -302.8243103027344, + "loss": 0.654, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7577495574951172, + "rewards/grad_term": 0.027012387290596962, + "rewards/margins": 3.6002304553985596, + "rewards/rejected": -4.357979774475098, + "step": 103 + }, + { + "epoch": 0.215739660313756, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 33.69936760710596, + "learning_rate": 9.930795847750865e-07, + "logits/chosen": 0.37147602438926697, + "logits/rejected": 0.5065699219703674, + "logps/accuracies": 0.8125, + "logps/chosen": -246.92913818359375, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -244.8133544921875, + "logps/ref_rejected": -273.55145263671875, + "logps/rejected": -323.93560791015625, + "loss": 0.5903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.211576446890831, + "rewards/grad_term": 0.01827353984117508, + "rewards/margins": 4.82683801651001, + "rewards/rejected": -5.038414001464844, + "step": 104 + }, + { + "epoch": 0.21781408012446518, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 30.373799785451364, + "learning_rate": 9.919261822376009e-07, + "logits/chosen": 0.650319516658783, + "logits/rejected": 0.6357383728027344, + "logps/accuracies": 0.6875, + "logps/chosen": -262.9386901855469, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -259.9366760253906, + "logps/ref_rejected": -256.1930236816406, + "logps/rejected": -282.44952392578125, + "loss": 0.5851, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3002018928527832, + "rewards/grad_term": 0.028288275003433228, + "rewards/margins": 2.3254497051239014, + "rewards/rejected": -2.6256518363952637, + "step": 105 + }, + { + "epoch": 0.21988849993517437, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 24.848196690851, + "learning_rate": 9.907727797001152e-07, + "logits/chosen": 0.35767537355422974, + "logits/rejected": 0.42828047275543213, + "logps/accuracies": 0.9375, + "logps/chosen": -260.8604736328125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -265.82379150390625, + "logps/ref_rejected": -287.0606689453125, + "logps/rejected": -353.9102478027344, + "loss": 0.5905, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4963342547416687, + "rewards/grad_term": 0.011764682829380035, + "rewards/margins": 7.18129301071167, + "rewards/rejected": -6.684958457946777, + "step": 106 + }, + { + "epoch": 0.22196291974588359, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 50.26567742873495, + "learning_rate": 9.896193771626296e-07, + "logits/chosen": 0.24374046921730042, + "logits/rejected": 0.2071159929037094, + "logps/accuracies": 0.8125, + "logps/chosen": -322.9122009277344, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -321.7509765625, + "logps/ref_rejected": -327.6671142578125, + "logps/rejected": -379.50433349609375, + "loss": 0.5947, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11612237989902496, + "rewards/grad_term": 0.013888241723179817, + "rewards/margins": 5.06759786605835, + "rewards/rejected": -5.183720588684082, + "step": 107 + }, + { + "epoch": 0.22403733955659277, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 24.60066728160171, + "learning_rate": 9.884659746251442e-07, + "logits/chosen": 0.28119832277297974, + "logits/rejected": 0.4410630464553833, + "logps/accuracies": 0.8125, + "logps/chosen": -263.49688720703125, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -259.3692626953125, + "logps/ref_rejected": -296.3498229980469, + "logps/rejected": -338.8854064941406, + "loss": 0.6482, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.41276171803474426, + "rewards/grad_term": 0.030529310926795006, + "rewards/margins": 3.840797185897827, + "rewards/rejected": -4.253559112548828, + "step": 108 + }, + { + "epoch": 0.22611175936730196, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 29.88832833016055, + "learning_rate": 9.873125720876585e-07, + "logits/chosen": 0.4925777018070221, + "logits/rejected": 0.39786702394485474, + "logps/accuracies": 0.625, + "logps/chosen": -288.4710693359375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -291.87298583984375, + "logps/ref_rejected": -257.73553466796875, + "logps/rejected": -322.43603515625, + "loss": 0.6003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3401949405670166, + "rewards/grad_term": 0.005394600797444582, + "rewards/margins": 6.810248851776123, + "rewards/rejected": -6.4700541496276855, + "step": 109 + }, + { + "epoch": 0.22818617917801115, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 23.915753236950135, + "learning_rate": 9.861591695501729e-07, + "logits/chosen": 0.2205159217119217, + "logits/rejected": 0.19697824120521545, + "logps/accuracies": 0.8125, + "logps/chosen": -352.7537841796875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -355.77337646484375, + "logps/ref_rejected": -356.9278564453125, + "logps/rejected": -400.95245361328125, + "loss": 0.5934, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.30196261405944824, + "rewards/grad_term": 0.017759006470441818, + "rewards/margins": 4.704426288604736, + "rewards/rejected": -4.402463436126709, + "step": 110 + }, + { + "epoch": 0.23026059898872034, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 27.27299967581395, + "learning_rate": 9.850057670126874e-07, + "logits/chosen": 0.37821733951568604, + "logits/rejected": 0.4970583915710449, + "logps/accuracies": 0.75, + "logps/chosen": -237.38504028320312, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -230.9459228515625, + "logps/ref_rejected": -249.4907684326172, + "logps/rejected": -298.8011169433594, + "loss": 0.633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6439133286476135, + "rewards/grad_term": 0.018191155046224594, + "rewards/margins": 4.287120819091797, + "rewards/rejected": -4.931033134460449, + "step": 111 + }, + { + "epoch": 0.23233501879942953, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 39.145128913057, + "learning_rate": 9.838523644752018e-07, + "logits/chosen": 0.1512741595506668, + "logits/rejected": 0.32822132110595703, + "logps/accuracies": 0.75, + "logps/chosen": -267.9004821777344, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -264.21697998046875, + "logps/ref_rejected": -307.68572998046875, + "logps/rejected": -361.28033447265625, + "loss": 0.5966, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3683478534221649, + "rewards/grad_term": 0.017407521605491638, + "rewards/margins": 4.9911088943481445, + "rewards/rejected": -5.359456539154053, + "step": 112 + }, + { + "epoch": 0.23440943861013872, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 27.5954679159607, + "learning_rate": 9.826989619377162e-07, + "logits/chosen": 0.5426469445228577, + "logits/rejected": 0.5697547197341919, + "logps/accuracies": 0.75, + "logps/chosen": -312.826904296875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -306.6598815917969, + "logps/ref_rejected": -277.28387451171875, + "logps/rejected": -353.48114013671875, + "loss": 0.6009, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.616702675819397, + "rewards/grad_term": 0.01106889545917511, + "rewards/margins": 7.003021240234375, + "rewards/rejected": -7.619723320007324, + "step": 113 + }, + { + "epoch": 0.23648385842084793, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 40.98809530673688, + "learning_rate": 9.815455594002307e-07, + "logits/chosen": 0.39876848459243774, + "logits/rejected": 0.3462454378604889, + "logps/accuracies": 0.8125, + "logps/chosen": -294.205078125, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -285.1748046875, + "logps/ref_rejected": -286.44140625, + "logps/rejected": -345.4891662597656, + "loss": 0.6112, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9030314087867737, + "rewards/grad_term": 0.011187486350536346, + "rewards/margins": 5.001744747161865, + "rewards/rejected": -5.904776573181152, + "step": 114 + }, + { + "epoch": 0.23855827823155712, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 40.11356790900559, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": 0.5221942067146301, + "logits/rejected": 0.4882541298866272, + "logps/accuracies": 0.8125, + "logps/chosen": -260.0171203613281, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -251.60545349121094, + "logps/ref_rejected": -259.59515380859375, + "logps/rejected": -315.2181701660156, + "loss": 0.6161, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8411648869514465, + "rewards/grad_term": 0.021061977371573448, + "rewards/margins": 4.721133232116699, + "rewards/rejected": -5.56229829788208, + "step": 115 + }, + { + "epoch": 0.2406326980422663, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 31.171420280298065, + "learning_rate": 9.792387543252594e-07, + "logits/chosen": 0.23254762589931488, + "logits/rejected": 0.2675570845603943, + "logps/accuracies": 0.9375, + "logps/chosen": -289.489501953125, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -289.00616455078125, + "logps/ref_rejected": -302.8209533691406, + "logps/rejected": -371.5626220703125, + "loss": 0.5818, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.048332199454307556, + "rewards/grad_term": 0.007328622043132782, + "rewards/margins": 6.825834274291992, + "rewards/rejected": -6.874166488647461, + "step": 116 + }, + { + "epoch": 0.2427071178529755, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 25.865643815270218, + "learning_rate": 9.780853517877738e-07, + "logits/chosen": 0.5108106136322021, + "logits/rejected": 0.5345089435577393, + "logps/accuracies": 0.875, + "logps/chosen": -284.67791748046875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -285.08941650390625, + "logps/ref_rejected": -308.15838623046875, + "logps/rejected": -370.12554931640625, + "loss": 0.5606, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04114929586648941, + "rewards/grad_term": 0.009833071380853653, + "rewards/margins": 6.237868785858154, + "rewards/rejected": -6.196719646453857, + "step": 117 + }, + { + "epoch": 0.24478153766368468, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 23.22657914288563, + "learning_rate": 9.769319492502884e-07, + "logits/chosen": 0.23898278176784515, + "logits/rejected": 0.2838956415653229, + "logps/accuracies": 0.9375, + "logps/chosen": -317.9300231933594, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -322.7581787109375, + "logps/ref_rejected": -333.2860107421875, + "logps/rejected": -404.1823425292969, + "loss": 0.5433, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4828159809112549, + "rewards/grad_term": 0.002703046426177025, + "rewards/margins": 7.572445869445801, + "rewards/rejected": -7.089630126953125, + "step": 118 + }, + { + "epoch": 0.24685595747439387, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 20.20109483182801, + "learning_rate": 9.757785467128027e-07, + "logits/chosen": 0.6895065307617188, + "logits/rejected": 0.7345404624938965, + "logps/accuracies": 0.8125, + "logps/chosen": -298.1529846191406, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -294.2225341796875, + "logps/ref_rejected": -282.6121520996094, + "logps/rejected": -337.44476318359375, + "loss": 0.5809, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3930422067642212, + "rewards/grad_term": 0.017846597358584404, + "rewards/margins": 5.090217113494873, + "rewards/rejected": -5.483259677886963, + "step": 119 + }, + { + "epoch": 0.24893037728510306, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 33.31398261005609, + "learning_rate": 9.74625144175317e-07, + "logits/chosen": 0.37160423398017883, + "logits/rejected": 0.3335186839103699, + "logps/accuracies": 0.75, + "logps/chosen": -276.2862548828125, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -279.1397399902344, + "logps/ref_rejected": -279.9727478027344, + "logps/rejected": -327.0055847167969, + "loss": 0.62, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.28534770011901855, + "rewards/grad_term": 0.01826310157775879, + "rewards/margins": 4.9886322021484375, + "rewards/rejected": -4.70328426361084, + "step": 120 + }, + { + "epoch": 0.25100479709581225, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0625, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 24.34104219676861, + "learning_rate": 9.734717416378314e-07, + "logits/chosen": 0.47060269117355347, + "logits/rejected": 0.533828854560852, + "logps/accuracies": 0.5625, + "logps/chosen": -250.86029052734375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -249.06607055664062, + "logps/ref_rejected": -266.4825439453125, + "logps/rejected": -304.35400390625, + "loss": 0.6245, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.17942330241203308, + "rewards/grad_term": 0.023510945960879326, + "rewards/margins": 3.6077194213867188, + "rewards/rejected": -3.787142515182495, + "step": 121 + }, + { + "epoch": 0.25307921690652146, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 32.38649249036862, + "learning_rate": 9.72318339100346e-07, + "logits/chosen": 0.058825843036174774, + "logits/rejected": 0.1310182362794876, + "logps/accuracies": 0.625, + "logps/chosen": -307.8884582519531, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -298.2418212890625, + "logps/ref_rejected": -289.80157470703125, + "logps/rejected": -332.2846984863281, + "loss": 0.6421, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9646634459495544, + "rewards/grad_term": 0.028044363483786583, + "rewards/margins": 3.283651828765869, + "rewards/rejected": -4.248315334320068, + "step": 122 + }, + { + "epoch": 0.2551536367172306, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 25.328252823752003, + "learning_rate": 9.711649365628604e-07, + "logits/chosen": 0.4595690667629242, + "logits/rejected": 0.4828678071498871, + "logps/accuracies": 0.8125, + "logps/chosen": -319.6042785644531, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -314.05908203125, + "logps/ref_rejected": -309.8699645996094, + "logps/rejected": -373.5873718261719, + "loss": 0.638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5545214414596558, + "rewards/grad_term": 0.0088451923802495, + "rewards/margins": 5.817216396331787, + "rewards/rejected": -6.371737480163574, + "step": 123 + }, + { + "epoch": 0.25722805652793984, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 58.14409414314697, + "learning_rate": 9.70011534025375e-07, + "logits/chosen": 0.16532814502716064, + "logits/rejected": 0.1864890158176422, + "logps/accuracies": 0.6875, + "logps/chosen": -322.5909729003906, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -328.1080017089844, + "logps/ref_rejected": -314.7974548339844, + "logps/rejected": -369.16583251953125, + "loss": 0.6333, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.551704466342926, + "rewards/grad_term": 0.014467663131654263, + "rewards/margins": 5.9885406494140625, + "rewards/rejected": -5.4368367195129395, + "step": 124 + }, + { + "epoch": 0.25930247633864906, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 29.577490597996505, + "learning_rate": 9.688581314878893e-07, + "logits/chosen": 0.27106067538261414, + "logits/rejected": 0.28159230947494507, + "logps/accuracies": 0.875, + "logps/chosen": -324.951904296875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -315.4916687011719, + "logps/ref_rejected": -310.50274658203125, + "logps/rejected": -384.7645568847656, + "loss": 0.6228, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9460303783416748, + "rewards/grad_term": 0.01042198482900858, + "rewards/margins": 6.480146884918213, + "rewards/rejected": -7.426177024841309, + "step": 125 + }, + { + "epoch": 0.2613768961493582, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 28.275175168662855, + "learning_rate": 9.677047289504036e-07, + "logits/chosen": 0.16482499241828918, + "logits/rejected": 0.13334128260612488, + "logps/accuracies": 0.8125, + "logps/chosen": -395.91998291015625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -403.248046875, + "logps/ref_rejected": -382.96343994140625, + "logps/rejected": -467.2958679199219, + "loss": 0.586, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7328065633773804, + "rewards/grad_term": 0.002520698821172118, + "rewards/margins": 9.166044235229492, + "rewards/rejected": -8.43323802947998, + "step": 126 + }, + { + "epoch": 0.26345131596006743, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 31.309734628094976, + "learning_rate": 9.66551326412918e-07, + "logits/chosen": 0.07952776551246643, + "logits/rejected": 0.1613186150789261, + "logps/accuracies": 0.875, + "logps/chosen": -320.70037841796875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -327.677490234375, + "logps/ref_rejected": -339.91748046875, + "logps/rejected": -401.8999328613281, + "loss": 0.6011, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6977108120918274, + "rewards/grad_term": 0.010999541729688644, + "rewards/margins": 6.89595890045166, + "rewards/rejected": -6.198247909545898, + "step": 127 + }, + { + "epoch": 0.2655257357707766, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 31.741083253136384, + "learning_rate": 9.653979238754326e-07, + "logits/chosen": 0.36669662594795227, + "logits/rejected": 0.40633296966552734, + "logps/accuracies": 0.8125, + "logps/chosen": -352.07159423828125, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -343.3804016113281, + "logps/ref_rejected": -353.6275939941406, + "logps/rejected": -414.04425048828125, + "loss": 0.6335, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8691204190254211, + "rewards/grad_term": 0.015956774353981018, + "rewards/margins": 5.172546863555908, + "rewards/rejected": -6.0416669845581055, + "step": 128 + }, + { + "epoch": 0.2655257357707766, + "eval_flips/correct->correct": 0.4236453175544739, + "eval_flips/correct->incorrect": 0.019704433158040047, + "eval_flips/incorrect->correct": 0.3300492465496063, + "eval_flips/incorrect->incorrect": 0.2266009896993637, + "eval_logits/chosen": 0.3016127645969391, + "eval_logits/rejected": 0.34773820638656616, + "eval_logps/accuracies": 0.7536945939064026, + "eval_logps/chosen": -294.51837158203125, + "eval_logps/ref_accuracies": 0.4433497488498688, + "eval_logps/ref_chosen": -287.3511047363281, + "eval_logps/ref_rejected": -289.0460205078125, + "eval_logps/rejected": -349.0025329589844, + "eval_loss": 0.6313375234603882, + "eval_rewards/accuracies": 0.8866994976997375, + "eval_rewards/chosen": -0.7167255878448486, + "eval_rewards/grad_term": 0.016497639939188957, + "eval_rewards/margins": 5.278923511505127, + "eval_rewards/rejected": -5.995649337768555, + "eval_runtime": 785.8607, + "eval_samples_per_second": 2.059, + "eval_steps_per_second": 0.258, + "step": 128 + }, + { + "epoch": 0.2676001555814858, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 35.58302765361098, + "learning_rate": 9.64244521337947e-07, + "logits/chosen": 0.3390696048736572, + "logits/rejected": 0.3560726046562195, + "logps/accuracies": 0.8125, + "logps/chosen": -322.2768249511719, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -310.5375061035156, + "logps/ref_rejected": -317.2485046386719, + "logps/rejected": -383.12200927734375, + "loss": 0.6209, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1739336252212524, + "rewards/grad_term": 0.013801316730678082, + "rewards/margins": 5.413419246673584, + "rewards/rejected": -6.5873517990112305, + "step": 129 + }, + { + "epoch": 0.26967457539219497, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 29.188493789977155, + "learning_rate": 9.630911188004613e-07, + "logits/chosen": 0.4089130163192749, + "logits/rejected": 0.3992210626602173, + "logps/accuracies": 0.625, + "logps/chosen": -246.3241729736328, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -250.2421417236328, + "logps/ref_rejected": -233.44342041015625, + "logps/rejected": -285.1191711425781, + "loss": 0.6077, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.391795814037323, + "rewards/grad_term": 0.014157270081341267, + "rewards/margins": 5.559370040893555, + "rewards/rejected": -5.167574405670166, + "step": 130 + }, + { + "epoch": 0.2717489952029042, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 36.02900591013536, + "learning_rate": 9.619377162629756e-07, + "logits/chosen": 0.32642504572868347, + "logits/rejected": 0.34259384870529175, + "logps/accuracies": 0.875, + "logps/chosen": -331.6784973144531, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -329.0844421386719, + "logps/ref_rejected": -341.3873596191406, + "logps/rejected": -407.9965515136719, + "loss": 0.632, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2594057619571686, + "rewards/grad_term": 0.006255139596760273, + "rewards/margins": 6.4015092849731445, + "rewards/rejected": -6.660915374755859, + "step": 131 + }, + { + "epoch": 0.2738234150136134, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 24.659869882606102, + "learning_rate": 9.607843137254902e-07, + "logits/chosen": 0.25219637155532837, + "logits/rejected": 0.21175454556941986, + "logps/accuracies": 0.6875, + "logps/chosen": -320.4070129394531, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -320.45111083984375, + "logps/ref_rejected": -291.7745056152344, + "logps/rejected": -371.3065490722656, + "loss": 0.5938, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0044078826904296875, + "rewards/grad_term": 0.007969305850565434, + "rewards/margins": 7.957607269287109, + "rewards/rejected": -7.953199863433838, + "step": 132 + }, + { + "epoch": 0.27589783482432256, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 66.69910712869152, + "learning_rate": 9.596309111880046e-07, + "logits/chosen": 0.4336986243724823, + "logits/rejected": 0.4323787987232208, + "logps/accuracies": 0.6875, + "logps/chosen": -302.4508361816406, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -285.8231201171875, + "logps/ref_rejected": -284.0436706542969, + "logps/rejected": -342.44122314453125, + "loss": 0.6008, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6627693176269531, + "rewards/grad_term": 0.020465871319174767, + "rewards/margins": 4.176986217498779, + "rewards/rejected": -5.839755535125732, + "step": 133 + }, + { + "epoch": 0.2779722546350318, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 35.23241127542349, + "learning_rate": 9.58477508650519e-07, + "logits/chosen": 0.502811074256897, + "logits/rejected": 0.5239925980567932, + "logps/accuracies": 0.75, + "logps/chosen": -317.7173156738281, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -317.0647888183594, + "logps/ref_rejected": -297.7698669433594, + "logps/rejected": -358.817626953125, + "loss": 0.6362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06524688005447388, + "rewards/grad_term": 0.008810807019472122, + "rewards/margins": 6.039529800415039, + "rewards/rejected": -6.1047773361206055, + "step": 134 + }, + { + "epoch": 0.28004667444574094, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 58.595789612615945, + "learning_rate": 9.573241061130333e-07, + "logits/chosen": 0.36474430561065674, + "logits/rejected": 0.35197287797927856, + "logps/accuracies": 0.75, + "logps/chosen": -326.8201904296875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -325.84783935546875, + "logps/ref_rejected": -326.2251892089844, + "logps/rejected": -372.5774841308594, + "loss": 0.6008, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09723645448684692, + "rewards/grad_term": 0.020627174526453018, + "rewards/margins": 4.537996768951416, + "rewards/rejected": -4.6352338790893555, + "step": 135 + }, + { + "epoch": 0.28212109425645016, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 25.14456310024547, + "learning_rate": 9.561707035755479e-07, + "logits/chosen": 0.26421457529067993, + "logits/rejected": 0.33900099992752075, + "logps/accuracies": 0.875, + "logps/chosen": -260.8269348144531, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -258.1986999511719, + "logps/ref_rejected": -283.28680419921875, + "logps/rejected": -322.0076904296875, + "loss": 0.6207, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26282617449760437, + "rewards/grad_term": 0.02707597427070141, + "rewards/margins": 3.609261989593506, + "rewards/rejected": -3.8720884323120117, + "step": 136 + }, + { + "epoch": 0.2841955140671593, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 42.8867860973982, + "learning_rate": 9.550173010380622e-07, + "logits/chosen": 0.09971302002668381, + "logits/rejected": 0.12542912364006042, + "logps/accuracies": 0.5625, + "logps/chosen": -322.331787109375, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -321.5882873535156, + "logps/ref_rejected": -315.2976379394531, + "logps/rejected": -352.12066650390625, + "loss": 0.6771, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07434892654418945, + "rewards/grad_term": 0.01898660883307457, + "rewards/margins": 3.607954978942871, + "rewards/rejected": -3.6823039054870605, + "step": 137 + }, + { + "epoch": 0.28626993387786853, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 67.6595700226082, + "learning_rate": 9.538638985005768e-07, + "logits/chosen": 0.23438116908073425, + "logits/rejected": 0.3342619240283966, + "logps/accuracies": 0.8125, + "logps/chosen": -300.3291015625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -296.4096984863281, + "logps/ref_rejected": -310.85064697265625, + "logps/rejected": -363.2022399902344, + "loss": 0.6381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39193806052207947, + "rewards/grad_term": 0.017197635024785995, + "rewards/margins": 4.84321928024292, + "rewards/rejected": -5.235157489776611, + "step": 138 + }, + { + "epoch": 0.28834435368857775, + "flips/correct->correct": 0.1875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 46.44298957827346, + "learning_rate": 9.52710495963091e-07, + "logits/chosen": 0.1467391550540924, + "logits/rejected": 0.11830101907253265, + "logps/accuracies": 0.625, + "logps/chosen": -321.1262512207031, + "logps/ref_accuracies": 0.1875, + "logps/ref_chosen": -325.08428955078125, + "logps/ref_rejected": -281.5567321777344, + "logps/rejected": -340.51849365234375, + "loss": 0.6354, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3958081305027008, + "rewards/grad_term": 0.013231747783720493, + "rewards/margins": 6.291983604431152, + "rewards/rejected": -5.896175384521484, + "step": 139 + }, + { + "epoch": 0.2904187734992869, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 36.89267412980749, + "learning_rate": 9.515570934256055e-07, + "logits/chosen": 0.3458084762096405, + "logits/rejected": 0.37101224064826965, + "logps/accuracies": 0.875, + "logps/chosen": -278.9690246582031, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -275.2301940917969, + "logps/ref_rejected": -293.14935302734375, + "logps/rejected": -343.7322998046875, + "loss": 0.6516, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.37388384342193604, + "rewards/grad_term": 0.020328430458903313, + "rewards/margins": 4.684409141540527, + "rewards/rejected": -5.058292865753174, + "step": 140 + }, + { + "epoch": 0.2924931933099961, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0625, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 29.520183041657447, + "learning_rate": 9.504036908881198e-07, + "logits/chosen": 0.2588088810443878, + "logits/rejected": 0.35400643944740295, + "logps/accuracies": 0.6875, + "logps/chosen": -329.9474182128906, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -309.6986999511719, + "logps/ref_rejected": -333.968994140625, + "logps/rejected": -376.42169189453125, + "loss": 0.6512, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.024869918823242, + "rewards/grad_term": 0.03422696888446808, + "rewards/margins": 2.220407247543335, + "rewards/rejected": -4.24527645111084, + "step": 141 + }, + { + "epoch": 0.2945676131207053, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 20.090341392606742, + "learning_rate": 9.492502883506344e-07, + "logits/chosen": 0.17465892434120178, + "logits/rejected": 0.19804833829402924, + "logps/accuracies": 0.6875, + "logps/chosen": -313.5783386230469, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -307.25164794921875, + "logps/ref_rejected": -290.28948974609375, + "logps/rejected": -353.5581359863281, + "loss": 0.5813, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6326678991317749, + "rewards/grad_term": 0.017216186970472336, + "rewards/margins": 5.694197177886963, + "rewards/rejected": -6.3268656730651855, + "step": 142 + }, + { + "epoch": 0.2966420329314145, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 38.2122061812, + "learning_rate": 9.480968858131488e-07, + "logits/chosen": 0.28784969449043274, + "logits/rejected": 0.38434553146362305, + "logps/accuracies": 0.8125, + "logps/chosen": -337.89697265625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -307.4181823730469, + "logps/ref_rejected": -347.0847473144531, + "logps/rejected": -420.6956787109375, + "loss": 0.5784, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.047877788543701, + "rewards/grad_term": 0.021118801087141037, + "rewards/margins": 4.3132147789001465, + "rewards/rejected": -7.361092567443848, + "step": 143 + }, + { + "epoch": 0.29871645274212366, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 37.993499093491, + "learning_rate": 9.469434832756632e-07, + "logits/chosen": 0.3818073570728302, + "logits/rejected": 0.4472813010215759, + "logps/accuracies": 1.0, + "logps/chosen": -309.6822509765625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -295.15576171875, + "logps/ref_rejected": -316.4786071777344, + "logps/rejected": -395.452392578125, + "loss": 0.6367, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4526524543762207, + "rewards/grad_term": 0.005610838998109102, + "rewards/margins": 6.4447221755981445, + "rewards/rejected": -7.897374629974365, + "step": 144 + }, + { + "epoch": 0.3007908725528329, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 54.21342872299693, + "learning_rate": 9.457900807381776e-07, + "logits/chosen": 0.10580252856016159, + "logits/rejected": 0.1295485496520996, + "logps/accuracies": 0.875, + "logps/chosen": -312.27178955078125, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -306.3640441894531, + "logps/ref_rejected": -344.1884765625, + "logps/rejected": -413.30572509765625, + "loss": 0.5971, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5907725095748901, + "rewards/grad_term": 0.014945675618946552, + "rewards/margins": 6.320951461791992, + "rewards/rejected": -6.911723613739014, + "step": 145 + }, + { + "epoch": 0.3028652923635421, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 49.62895603709547, + "learning_rate": 9.446366782006921e-07, + "logits/chosen": 0.496852844953537, + "logits/rejected": 0.49739354848861694, + "logps/accuracies": 0.75, + "logps/chosen": -254.4556121826172, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -253.5081787109375, + "logps/ref_rejected": -243.6640625, + "logps/rejected": -306.6009826660156, + "loss": 0.6307, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09474316239356995, + "rewards/grad_term": 0.014840014278888702, + "rewards/margins": 6.198947906494141, + "rewards/rejected": -6.2936906814575195, + "step": 146 + }, + { + "epoch": 0.30493971217425125, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 33.16438726233068, + "learning_rate": 9.434832756632064e-07, + "logits/chosen": 0.3227022588253021, + "logits/rejected": 0.29622456431388855, + "logps/accuracies": 0.9375, + "logps/chosen": -315.11383056640625, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -323.7686462402344, + "logps/ref_rejected": -318.28631591796875, + "logps/rejected": -397.4606628417969, + "loss": 0.598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.865482747554779, + "rewards/grad_term": 0.0047454568557441235, + "rewards/margins": 8.782920837402344, + "rewards/rejected": -7.917438507080078, + "step": 147 + }, + { + "epoch": 0.30701413198496047, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 32.98244501476046, + "learning_rate": 9.423298731257209e-07, + "logits/chosen": 0.26352736353874207, + "logits/rejected": 0.2875834107398987, + "logps/accuracies": 0.6875, + "logps/chosen": -268.8025817871094, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -260.4774169921875, + "logps/ref_rejected": -263.6733703613281, + "logps/rejected": -314.69268798828125, + "loss": 0.6464, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8325148820877075, + "rewards/grad_term": 0.02254444733262062, + "rewards/margins": 4.269417762756348, + "rewards/rejected": -5.101933002471924, + "step": 148 + }, + { + "epoch": 0.30908855179566963, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 33.32776229173226, + "learning_rate": 9.411764705882352e-07, + "logits/chosen": 0.11156149208545685, + "logits/rejected": 0.24737051129341125, + "logps/accuracies": 0.75, + "logps/chosen": -303.48388671875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -298.459228515625, + "logps/ref_rejected": -314.2269592285156, + "logps/rejected": -370.69622802734375, + "loss": 0.5583, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5024658441543579, + "rewards/grad_term": 0.022268792614340782, + "rewards/margins": 5.144461631774902, + "rewards/rejected": -5.646927356719971, + "step": 149 + }, + { + "epoch": 0.31116297160637885, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0625, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 21.82859690680754, + "learning_rate": 9.400230680507497e-07, + "logits/chosen": 0.29909923672676086, + "logits/rejected": 0.33298757672309875, + "logps/accuracies": 0.625, + "logps/chosen": -245.38111877441406, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -240.87034606933594, + "logps/ref_rejected": -240.8376922607422, + "logps/rejected": -289.25067138671875, + "loss": 0.5837, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.45107802748680115, + "rewards/grad_term": 0.022424593567848206, + "rewards/margins": 4.390218257904053, + "rewards/rejected": -4.841296195983887, + "step": 150 + }, + { + "epoch": 0.313237391417088, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 32.35074274110825, + "learning_rate": 9.38869665513264e-07, + "logits/chosen": 0.1612250953912735, + "logits/rejected": 0.15677325427532196, + "logps/accuracies": 0.8125, + "logps/chosen": -291.77081298828125, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -292.1325988769531, + "logps/ref_rejected": -287.9341735839844, + "logps/rejected": -358.65057373046875, + "loss": 0.6135, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.03617708384990692, + "rewards/grad_term": 0.011941466480493546, + "rewards/margins": 7.107817649841309, + "rewards/rejected": -7.071640968322754, + "step": 151 + }, + { + "epoch": 0.3153118112277972, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 42.437628926037625, + "learning_rate": 9.377162629757785e-07, + "logits/chosen": 0.21432383358478546, + "logits/rejected": 0.2382117211818695, + "logps/accuracies": 0.9375, + "logps/chosen": -270.55712890625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -275.3125305175781, + "logps/ref_rejected": -276.75384521484375, + "logps/rejected": -355.29779052734375, + "loss": 0.5223, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4755399823188782, + "rewards/grad_term": 0.004575583152472973, + "rewards/margins": 8.329938888549805, + "rewards/rejected": -7.854398250579834, + "step": 152 + }, + { + "epoch": 0.31738623103850644, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 41.66344734527698, + "learning_rate": 9.365628604382929e-07, + "logits/chosen": -0.07189223915338516, + "logits/rejected": -0.08959042280912399, + "logps/accuracies": 0.875, + "logps/chosen": -328.5618896484375, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -327.5036926269531, + "logps/ref_rejected": -321.0348205566406, + "logps/rejected": -393.354248046875, + "loss": 0.5755, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10582125186920166, + "rewards/grad_term": 0.014492910355329514, + "rewards/margins": 7.126119613647461, + "rewards/rejected": -7.231941223144531, + "step": 153 + }, + { + "epoch": 0.3194606508492156, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 29.74794689137638, + "learning_rate": 9.354094579008073e-07, + "logits/chosen": 0.35974666476249695, + "logits/rejected": 0.3688337206840515, + "logps/accuracies": 0.8125, + "logps/chosen": -313.2078857421875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -305.1917419433594, + "logps/ref_rejected": -320.38128662109375, + "logps/rejected": -390.50933837890625, + "loss": 0.5928, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8016154766082764, + "rewards/grad_term": 0.008705828338861465, + "rewards/margins": 6.2111945152282715, + "rewards/rejected": -7.012809753417969, + "step": 154 + }, + { + "epoch": 0.3215350706599248, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 28.687378270489237, + "learning_rate": 9.342560553633218e-07, + "logits/chosen": 0.18269102275371552, + "logits/rejected": 0.1776474416255951, + "logps/accuracies": 0.875, + "logps/chosen": -275.4834289550781, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -272.5428771972656, + "logps/ref_rejected": -261.2928161621094, + "logps/rejected": -336.526611328125, + "loss": 0.5897, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2940564453601837, + "rewards/grad_term": 0.006705356761813164, + "rewards/margins": 7.2293267250061035, + "rewards/rejected": -7.523382186889648, + "step": 155 + }, + { + "epoch": 0.323609490470634, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 36.335486366302646, + "learning_rate": 9.331026528258363e-07, + "logits/chosen": 0.07444247603416443, + "logits/rejected": 0.20133280754089355, + "logps/accuracies": 0.75, + "logps/chosen": -329.2906188964844, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -316.47515869140625, + "logps/ref_rejected": -332.9421081542969, + "logps/rejected": -394.68560791015625, + "loss": 0.5576, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2815489768981934, + "rewards/grad_term": 0.0174331646412611, + "rewards/margins": 4.892797946929932, + "rewards/rejected": -6.174346923828125, + "step": 156 + }, + { + "epoch": 0.3256839102813432, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 27.29416495658281, + "learning_rate": 9.319492502883506e-07, + "logits/chosen": 0.4256312847137451, + "logits/rejected": 0.4740726053714752, + "logps/accuracies": 0.9375, + "logps/chosen": -321.6173095703125, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -303.39617919921875, + "logps/ref_rejected": -309.72113037109375, + "logps/rejected": -375.21429443359375, + "loss": 0.5708, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.822115182876587, + "rewards/grad_term": 0.014289310202002525, + "rewards/margins": 4.727199077606201, + "rewards/rejected": -6.549314022064209, + "step": 157 + }, + { + "epoch": 0.32775833009205235, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 32.223889847251904, + "learning_rate": 9.307958477508651e-07, + "logits/chosen": 0.3213425576686859, + "logits/rejected": 0.35512280464172363, + "logps/accuracies": 0.8125, + "logps/chosen": -299.30364990234375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -305.61181640625, + "logps/ref_rejected": -306.4347839355469, + "logps/rejected": -362.4326171875, + "loss": 0.573, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6308162212371826, + "rewards/grad_term": 0.008797680027782917, + "rewards/margins": 6.230600357055664, + "rewards/rejected": -5.5997843742370605, + "step": 158 + }, + { + "epoch": 0.32983274990276157, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 42.213863271098234, + "learning_rate": 9.296424452133794e-07, + "logits/chosen": 0.377382755279541, + "logits/rejected": 0.456988126039505, + "logps/accuracies": 1.0, + "logps/chosen": -288.3867492675781, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -293.0536804199219, + "logps/ref_rejected": -317.74163818359375, + "logps/rejected": -391.38153076171875, + "loss": 0.546, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4666934609413147, + "rewards/grad_term": 0.0034745843149721622, + "rewards/margins": 7.8306803703308105, + "rewards/rejected": -7.363986968994141, + "step": 159 + }, + { + "epoch": 0.3319071697134708, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 71.4121450161779, + "learning_rate": 9.284890426758939e-07, + "logits/chosen": 0.31267380714416504, + "logits/rejected": 0.33250027894973755, + "logps/accuracies": 0.75, + "logps/chosen": -327.6630554199219, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -322.4392395019531, + "logps/ref_rejected": -329.0008544921875, + "logps/rejected": -386.5987548828125, + "loss": 0.6503, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5223828554153442, + "rewards/grad_term": 0.015317104756832123, + "rewards/margins": 5.23740291595459, + "rewards/rejected": -5.759785175323486, + "step": 160 + }, + { + "epoch": 0.33398158952417994, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 90.86602072589874, + "learning_rate": 9.273356401384083e-07, + "logits/chosen": 0.16347447037696838, + "logits/rejected": 0.2435542643070221, + "logps/accuracies": 0.8125, + "logps/chosen": -309.9219665527344, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -309.3226013183594, + "logps/ref_rejected": -317.44964599609375, + "logps/rejected": -393.472900390625, + "loss": 0.5695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05993741750717163, + "rewards/grad_term": 0.009122053161263466, + "rewards/margins": 7.542388916015625, + "rewards/rejected": -7.602326393127441, + "step": 161 + }, + { + "epoch": 0.33605600933488916, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 63.38287747184822, + "learning_rate": 9.261822376009227e-07, + "logits/chosen": 0.03160097077488899, + "logits/rejected": 0.15727761387825012, + "logps/accuracies": 0.75, + "logps/chosen": -304.5788269042969, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -295.0198669433594, + "logps/ref_rejected": -315.7266540527344, + "logps/rejected": -362.4599304199219, + "loss": 0.669, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9558972716331482, + "rewards/grad_term": 0.02149307169020176, + "rewards/margins": 3.7174317836761475, + "rewards/rejected": -4.673328876495361, + "step": 162 + }, + { + "epoch": 0.3381304291455983, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 37.24982276954982, + "learning_rate": 9.250288350634371e-07, + "logits/chosen": 0.24977634847164154, + "logits/rejected": 0.2465619146823883, + "logps/accuracies": 0.875, + "logps/chosen": -300.94000244140625, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -296.2314453125, + "logps/ref_rejected": -290.3169250488281, + "logps/rejected": -364.783447265625, + "loss": 0.5658, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.47085797786712646, + "rewards/grad_term": 0.00792708620429039, + "rewards/margins": 6.97579288482666, + "rewards/rejected": -7.446650981903076, + "step": 163 + }, + { + "epoch": 0.34020484895630754, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 41.17406855028247, + "learning_rate": 9.238754325259515e-07, + "logits/chosen": 0.27946552634239197, + "logits/rejected": 0.2818312346935272, + "logps/accuracies": 0.75, + "logps/chosen": -334.7640075683594, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -326.3661804199219, + "logps/ref_rejected": -322.5880126953125, + "logps/rejected": -379.8989562988281, + "loss": 0.5503, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8397865295410156, + "rewards/grad_term": 0.01920832134783268, + "rewards/margins": 4.89130973815918, + "rewards/rejected": -5.731095790863037, + "step": 164 + }, + { + "epoch": 0.3422792687670167, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 30.140573016986096, + "learning_rate": 9.227220299884659e-07, + "logits/chosen": 0.106672503054142, + "logits/rejected": 0.20751769840717316, + "logps/accuracies": 0.75, + "logps/chosen": -289.3790283203125, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -279.317138671875, + "logps/ref_rejected": -285.1666564941406, + "logps/rejected": -346.164794921875, + "loss": 0.5963, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0061873197555542, + "rewards/grad_term": 0.0124615877866745, + "rewards/margins": 5.093625068664551, + "rewards/rejected": -6.099812030792236, + "step": 165 + }, + { + "epoch": 0.3443536885777259, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 33.13491495817512, + "learning_rate": 9.215686274509803e-07, + "logits/chosen": 0.49837812781333923, + "logits/rejected": 0.5220686793327332, + "logps/accuracies": 0.8125, + "logps/chosen": -289.36871337890625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -290.96624755859375, + "logps/ref_rejected": -274.1662292480469, + "logps/rejected": -336.1552734375, + "loss": 0.5496, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15975357592105865, + "rewards/grad_term": 0.012641198933124542, + "rewards/margins": 6.358658790588379, + "rewards/rejected": -6.1989054679870605, + "step": 166 + }, + { + "epoch": 0.34642810838843513, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 54.11732274527683, + "learning_rate": 9.204152249134947e-07, + "logits/chosen": 0.015494227409362793, + "logits/rejected": 0.016678210347890854, + "logps/accuracies": 0.8125, + "logps/chosen": -325.68914794921875, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -331.82000732421875, + "logps/ref_rejected": -319.7809143066406, + "logps/rejected": -382.5654296875, + "loss": 0.5618, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6130860447883606, + "rewards/grad_term": 0.009663441218435764, + "rewards/margins": 6.891541004180908, + "rewards/rejected": -6.278454780578613, + "step": 167 + }, + { + "epoch": 0.3485025281991443, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 35.17938428087587, + "learning_rate": 9.192618223760092e-07, + "logits/chosen": 0.1498590111732483, + "logits/rejected": 0.03451567143201828, + "logps/accuracies": 0.75, + "logps/chosen": -318.3073425292969, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -324.92156982421875, + "logps/ref_rejected": -279.1431884765625, + "logps/rejected": -343.2935791015625, + "loss": 0.5567, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6614212989807129, + "rewards/grad_term": 0.004435483831912279, + "rewards/margins": 7.076463222503662, + "rewards/rejected": -6.415041923522949, + "step": 168 + }, + { + "epoch": 0.3505769480098535, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 29.79744425397875, + "learning_rate": 9.181084198385236e-07, + "logits/chosen": 0.4352983832359314, + "logits/rejected": 0.42166027426719666, + "logps/accuracies": 0.5625, + "logps/chosen": -218.41848754882812, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -218.35841369628906, + "logps/ref_rejected": -203.54812622070312, + "logps/rejected": -244.30548095703125, + "loss": 0.6235, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0060057491064071655, + "rewards/grad_term": 0.020719772204756737, + "rewards/margins": 4.069727897644043, + "rewards/rejected": -4.0757341384887695, + "step": 169 + }, + { + "epoch": 0.35265136782056267, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 37.443318352494714, + "learning_rate": 9.16955017301038e-07, + "logits/chosen": 0.34023189544677734, + "logits/rejected": 0.36414065957069397, + "logps/accuracies": 0.5625, + "logps/chosen": -341.10162353515625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -348.6609191894531, + "logps/ref_rejected": -313.69305419921875, + "logps/rejected": -365.22552490234375, + "loss": 0.5449, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7559297680854797, + "rewards/grad_term": 0.012427425011992455, + "rewards/margins": 5.909174919128418, + "rewards/rejected": -5.153245449066162, + "step": 170 + }, + { + "epoch": 0.3547257876312719, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 38.974124690109655, + "learning_rate": 9.158016147635525e-07, + "logits/chosen": 0.2296074777841568, + "logits/rejected": 0.22281108796596527, + "logps/accuracies": 0.75, + "logps/chosen": -266.399658203125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -266.3810729980469, + "logps/ref_rejected": -264.22479248046875, + "logps/rejected": -312.05328369140625, + "loss": 0.5639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.00185690401121974, + "rewards/grad_term": 0.021189574152231216, + "rewards/margins": 4.780992031097412, + "rewards/rejected": -4.782848358154297, + "step": 171 + }, + { + "epoch": 0.35680020744198104, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 40.615225671584604, + "learning_rate": 9.146482122260668e-07, + "logits/chosen": -0.058992840349674225, + "logits/rejected": 0.13406533002853394, + "logps/accuracies": 0.8125, + "logps/chosen": -248.48712158203125, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -245.46426391601562, + "logps/ref_rejected": -322.3562316894531, + "logps/rejected": -359.47808837890625, + "loss": 0.6297, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3022858798503876, + "rewards/grad_term": 0.02431631274521351, + "rewards/margins": 3.4098992347717285, + "rewards/rejected": -3.7121849060058594, + "step": 172 + }, + { + "epoch": 0.35887462725269026, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 48.38748944944743, + "learning_rate": 9.134948096885813e-07, + "logits/chosen": 0.17907698452472687, + "logits/rejected": 0.2532532811164856, + "logps/accuracies": 0.75, + "logps/chosen": -268.450927734375, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -272.5251770019531, + "logps/ref_rejected": -259.0370178222656, + "logps/rejected": -306.13800048828125, + "loss": 0.6268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.40742844343185425, + "rewards/grad_term": 0.01490036677569151, + "rewards/margins": 5.117522716522217, + "rewards/rejected": -4.710094451904297, + "step": 173 + }, + { + "epoch": 0.3609490470633995, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 31.921038664510473, + "learning_rate": 9.123414071510956e-07, + "logits/chosen": 0.24931451678276062, + "logits/rejected": 0.32089927792549133, + "logps/accuracies": 0.875, + "logps/chosen": -327.949951171875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -325.4226379394531, + "logps/ref_rejected": -319.7037353515625, + "logps/rejected": -401.667724609375, + "loss": 0.5455, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.25273361802101135, + "rewards/grad_term": 0.007721267640590668, + "rewards/margins": 7.943665504455566, + "rewards/rejected": -8.196398735046387, + "step": 174 + }, + { + "epoch": 0.36302346687410864, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 32.597119731399815, + "learning_rate": 9.111880046136101e-07, + "logits/chosen": 0.14227242767810822, + "logits/rejected": 0.14696185290813446, + "logps/accuracies": 0.9375, + "logps/chosen": -302.6449890136719, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -304.1968078613281, + "logps/ref_rejected": -312.4093322753906, + "logps/rejected": -365.310791015625, + "loss": 0.5941, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15517938137054443, + "rewards/grad_term": 0.017512062564492226, + "rewards/margins": 5.4453277587890625, + "rewards/rejected": -5.2901482582092285, + "step": 175 + }, + { + "epoch": 0.36509788668481785, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 40.38991325463123, + "learning_rate": 9.100346020761245e-07, + "logits/chosen": 0.4459385275840759, + "logits/rejected": 0.48317578434944153, + "logps/accuracies": 0.9375, + "logps/chosen": -375.50152587890625, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -361.5687255859375, + "logps/ref_rejected": -390.73028564453125, + "logps/rejected": -472.79315185546875, + "loss": 0.6096, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3932788372039795, + "rewards/grad_term": 0.012904556468129158, + "rewards/margins": 6.81300163269043, + "rewards/rejected": -8.206280708312988, + "step": 176 + }, + { + "epoch": 0.367172306495527, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 40.714548146552936, + "learning_rate": 9.088811995386389e-07, + "logits/chosen": 0.1182754784822464, + "logits/rejected": 0.10860362648963928, + "logps/accuracies": 0.8125, + "logps/chosen": -277.8997802734375, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -265.0371398925781, + "logps/ref_rejected": -266.34906005859375, + "logps/rejected": -348.8620300292969, + "loss": 0.5457, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.286261796951294, + "rewards/grad_term": 0.013155965134501457, + "rewards/margins": 6.965037822723389, + "rewards/rejected": -8.251298904418945, + "step": 177 + }, + { + "epoch": 0.36924672630623623, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 23.925847924727726, + "learning_rate": 9.077277970011533e-07, + "logits/chosen": 0.19493117928504944, + "logits/rejected": 0.1747354418039322, + "logps/accuracies": 0.6875, + "logps/chosen": -253.36224365234375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -253.14541625976562, + "logps/ref_rejected": -253.6729736328125, + "logps/rejected": -327.2242736816406, + "loss": 0.5883, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02168332040309906, + "rewards/grad_term": 0.010116681456565857, + "rewards/margins": 7.33344841003418, + "rewards/rejected": -7.35513162612915, + "step": 178 + }, + { + "epoch": 0.37132114611694544, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 41.54057148552365, + "learning_rate": 9.065743944636677e-07, + "logits/chosen": 0.09981651604175568, + "logits/rejected": 0.060021985322237015, + "logps/accuracies": 0.8125, + "logps/chosen": -319.4670104980469, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -305.79840087890625, + "logps/ref_rejected": -288.304443359375, + "logps/rejected": -342.7301940917969, + "loss": 0.6653, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3668627738952637, + "rewards/grad_term": 0.01719023287296295, + "rewards/margins": 4.075715065002441, + "rewards/rejected": -5.442577838897705, + "step": 179 + }, + { + "epoch": 0.3733955659276546, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 53.53086030452953, + "learning_rate": 9.054209919261822e-07, + "logits/chosen": 0.32655516266822815, + "logits/rejected": 0.4233202338218689, + "logps/accuracies": 0.75, + "logps/chosen": -231.95205688476562, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -222.77748107910156, + "logps/ref_rejected": -249.9750213623047, + "logps/rejected": -298.7811584472656, + "loss": 0.616, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9174575805664062, + "rewards/grad_term": 0.03101710043847561, + "rewards/margins": 3.9631576538085938, + "rewards/rejected": -4.880615234375, + "step": 180 + }, + { + "epoch": 0.3754699857383638, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 32.474177233879054, + "learning_rate": 9.042675893886967e-07, + "logits/chosen": 0.15671122074127197, + "logits/rejected": 0.15824642777442932, + "logps/accuracies": 0.875, + "logps/chosen": -331.437255859375, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -329.1837158203125, + "logps/ref_rejected": -332.14324951171875, + "logps/rejected": -419.30963134765625, + "loss": 0.585, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2253548800945282, + "rewards/grad_term": 0.006346164736896753, + "rewards/margins": 8.491281509399414, + "rewards/rejected": -8.716635704040527, + "step": 181 + }, + { + "epoch": 0.377544405549073, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 32.49282958416918, + "learning_rate": 9.03114186851211e-07, + "logits/chosen": 0.18260034918785095, + "logits/rejected": 0.14178498089313507, + "logps/accuracies": 0.8125, + "logps/chosen": -295.148193359375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -288.6500244140625, + "logps/ref_rejected": -278.32867431640625, + "logps/rejected": -359.5013427734375, + "loss": 0.5916, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6498188972473145, + "rewards/grad_term": 0.008202875964343548, + "rewards/margins": 7.467443466186523, + "rewards/rejected": -8.11726188659668, + "step": 182 + }, + { + "epoch": 0.3796188253597822, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 29.4740035475264, + "learning_rate": 9.019607843137255e-07, + "logits/chosen": 0.26660820841789246, + "logits/rejected": 0.36798760294914246, + "logps/accuracies": 0.75, + "logps/chosen": -304.508056640625, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -300.42181396484375, + "logps/ref_rejected": -271.1709899902344, + "logps/rejected": -343.62274169921875, + "loss": 0.5723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40862417221069336, + "rewards/grad_term": 0.009963629767298698, + "rewards/margins": 6.836550712585449, + "rewards/rejected": -7.245175361633301, + "step": 183 + }, + { + "epoch": 0.38169324517049136, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 39.92370508801508, + "learning_rate": 9.008073817762398e-07, + "logits/chosen": 0.11014918982982635, + "logits/rejected": 0.12970136106014252, + "logps/accuracies": 0.875, + "logps/chosen": -311.4981689453125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -316.7832336425781, + "logps/ref_rejected": -304.4905700683594, + "logps/rejected": -379.767333984375, + "loss": 0.5724, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5285077095031738, + "rewards/grad_term": 0.0013775170082226396, + "rewards/margins": 8.056180953979492, + "rewards/rejected": -7.527673721313477, + "step": 184 + }, + { + "epoch": 0.3837676649812006, + "flips/correct->correct": 0.1875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 77.26727734044364, + "learning_rate": 8.996539792387543e-07, + "logits/chosen": 0.32915353775024414, + "logits/rejected": 0.36334753036499023, + "logps/accuracies": 0.625, + "logps/chosen": -317.26904296875, + "logps/ref_accuracies": 0.1875, + "logps/ref_chosen": -323.752685546875, + "logps/ref_rejected": -283.16058349609375, + "logps/rejected": -342.1800537109375, + "loss": 0.5702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6483669281005859, + "rewards/grad_term": 0.011216258630156517, + "rewards/margins": 6.550315856933594, + "rewards/rejected": -5.901949405670166, + "step": 185 + }, + { + "epoch": 0.3858420847919098, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 44.46292399532884, + "learning_rate": 8.985005767012687e-07, + "logits/chosen": 0.250750333070755, + "logits/rejected": 0.2685026228427887, + "logps/accuracies": 0.625, + "logps/chosen": -308.99078369140625, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -318.26324462890625, + "logps/ref_rejected": -362.1750793457031, + "logps/rejected": -410.8619384765625, + "loss": 0.6081, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9272449612617493, + "rewards/grad_term": 0.01049741543829441, + "rewards/margins": 5.795932769775391, + "rewards/rejected": -4.868687152862549, + "step": 186 + }, + { + "epoch": 0.38791650460261895, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 60.58056206089815, + "learning_rate": 8.973471741637831e-07, + "logits/chosen": 0.16933155059814453, + "logits/rejected": 0.23612166941165924, + "logps/accuracies": 0.8125, + "logps/chosen": -347.870849609375, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -355.09906005859375, + "logps/ref_rejected": -377.2695007324219, + "logps/rejected": -426.260009765625, + "loss": 0.6342, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7228207588195801, + "rewards/grad_term": 0.016279596835374832, + "rewards/margins": 5.6218695640563965, + "rewards/rejected": -4.899049282073975, + "step": 187 + }, + { + "epoch": 0.38999092441332817, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 38.184653723145104, + "learning_rate": 8.961937716262975e-07, + "logits/chosen": 0.19383230805397034, + "logits/rejected": 0.27799373865127563, + "logps/accuracies": 0.875, + "logps/chosen": -237.73406982421875, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -243.88479614257812, + "logps/ref_rejected": -270.0097961425781, + "logps/rejected": -308.8341064453125, + "loss": 0.6516, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6150741577148438, + "rewards/grad_term": 0.016990307718515396, + "rewards/margins": 4.497503280639648, + "rewards/rejected": -3.8824288845062256, + "step": 188 + }, + { + "epoch": 0.3920653442240373, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 94.03754868556166, + "learning_rate": 8.95040369088812e-07, + "logits/chosen": 0.18277229368686676, + "logits/rejected": 0.32301703095436096, + "logps/accuracies": 0.75, + "logps/chosen": -297.812744140625, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -297.7760314941406, + "logps/ref_rejected": -351.8489990234375, + "logps/rejected": -395.95452880859375, + "loss": 0.5906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.003670990467071533, + "rewards/grad_term": 0.01534755527973175, + "rewards/margins": 4.406883716583252, + "rewards/rejected": -4.4105544090271, + "step": 189 + }, + { + "epoch": 0.39413976403474654, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 36.637550653396346, + "learning_rate": 8.938869665513263e-07, + "logits/chosen": 0.3476504683494568, + "logits/rejected": 0.33958810567855835, + "logps/accuracies": 0.8125, + "logps/chosen": -227.98374938964844, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -220.88394165039062, + "logps/ref_rejected": -224.73675537109375, + "logps/rejected": -278.4228515625, + "loss": 0.6248, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7099814414978027, + "rewards/grad_term": 0.017782405018806458, + "rewards/margins": 4.658628463745117, + "rewards/rejected": -5.36860990524292, + "step": 190 + }, + { + "epoch": 0.3962141838454557, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 34.6180474435158, + "learning_rate": 8.927335640138408e-07, + "logits/chosen": 0.29524171352386475, + "logits/rejected": 0.249376118183136, + "logps/accuracies": 0.75, + "logps/chosen": -340.69873046875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -323.4943542480469, + "logps/ref_rejected": -289.3078308105469, + "logps/rejected": -365.9657287597656, + "loss": 0.5815, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7204382419586182, + "rewards/grad_term": 0.016098525375127792, + "rewards/margins": 5.945353031158447, + "rewards/rejected": -7.6657915115356445, + "step": 191 + }, + { + "epoch": 0.3982886036561649, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 38.65115109290824, + "learning_rate": 8.915801614763551e-07, + "logits/chosen": 0.10618914663791656, + "logits/rejected": 0.20010565221309662, + "logps/accuracies": 0.8125, + "logps/chosen": -255.34237670898438, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -254.41799926757812, + "logps/ref_rejected": -255.06790161132812, + "logps/rejected": -335.4033203125, + "loss": 0.5252, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09243983030319214, + "rewards/grad_term": 0.008064459078013897, + "rewards/margins": 7.941101551055908, + "rewards/rejected": -8.033540725708008, + "step": 192 + }, + { + "epoch": 0.3982886036561649, + "eval_flips/correct->correct": 0.4334975481033325, + "eval_flips/correct->incorrect": 0.009852216579020023, + "eval_flips/incorrect->correct": 0.35960590839385986, + "eval_flips/incorrect->incorrect": 0.19704432785511017, + "eval_logits/chosen": 0.20541736483573914, + "eval_logits/rejected": 0.25030994415283203, + "eval_logps/accuracies": 0.7931034564971924, + "eval_logps/chosen": -291.723388671875, + "eval_logps/ref_accuracies": 0.4433497488498688, + "eval_logps/ref_chosen": -287.3511047363281, + "eval_logps/ref_rejected": -289.0460205078125, + "eval_logps/rejected": -350.8753356933594, + "eval_loss": 0.6111010313034058, + "eval_rewards/accuracies": 0.8620689511299133, + "eval_rewards/chosen": -0.4372285008430481, + "eval_rewards/grad_term": 0.016007939353585243, + "eval_rewards/margins": 5.745702743530273, + "eval_rewards/rejected": -6.182931900024414, + "eval_runtime": 791.2188, + "eval_samples_per_second": 2.045, + "eval_steps_per_second": 0.257, + "step": 192 + }, + { + "epoch": 0.40036302346687414, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 29.785884187336258, + "learning_rate": 8.904267589388697e-07, + "logits/chosen": 0.4344290494918823, + "logits/rejected": 0.4853968620300293, + "logps/accuracies": 0.75, + "logps/chosen": -235.33804321289062, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -224.7119903564453, + "logps/ref_rejected": -247.52615356445312, + "logps/rejected": -317.22064208984375, + "loss": 0.5751, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0626037120819092, + "rewards/grad_term": 0.017159339040517807, + "rewards/margins": 5.906847953796387, + "rewards/rejected": -6.969451904296875, + "step": 193 + }, + { + "epoch": 0.4024374432775833, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 33.823622203078585, + "learning_rate": 8.89273356401384e-07, + "logits/chosen": -0.00733010470867157, + "logits/rejected": 0.039806053042411804, + "logps/accuracies": 0.8125, + "logps/chosen": -298.87469482421875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -278.5909423828125, + "logps/ref_rejected": -274.5445251464844, + "logps/rejected": -354.7998352050781, + "loss": 0.6322, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.028369426727295, + "rewards/grad_term": 0.014807065948843956, + "rewards/margins": 5.997160911560059, + "rewards/rejected": -8.025529861450195, + "step": 194 + }, + { + "epoch": 0.4045118630882925, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 71.82502886035329, + "learning_rate": 8.881199538638985e-07, + "logits/chosen": 0.06032078340649605, + "logits/rejected": 0.08065234869718552, + "logps/accuracies": 0.8125, + "logps/chosen": -305.660888671875, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -292.9607238769531, + "logps/ref_rejected": -284.1155090332031, + "logps/rejected": -348.9861755371094, + "loss": 0.6122, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2700166702270508, + "rewards/grad_term": 0.022815629839897156, + "rewards/margins": 5.2170515060424805, + "rewards/rejected": -6.487068176269531, + "step": 195 + }, + { + "epoch": 0.40658628289900167, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 47.96509016237826, + "learning_rate": 8.869665513264129e-07, + "logits/chosen": 0.5451265573501587, + "logits/rejected": 0.6818545460700989, + "logps/accuracies": 0.8125, + "logps/chosen": -262.90087890625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -249.34771728515625, + "logps/ref_rejected": -265.0631408691406, + "logps/rejected": -335.938232421875, + "loss": 0.5831, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3553152084350586, + "rewards/grad_term": 0.012808618135750294, + "rewards/margins": 5.732193946838379, + "rewards/rejected": -7.087508201599121, + "step": 196 + }, + { + "epoch": 0.4086607027097109, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 21.697323150316873, + "learning_rate": 8.858131487889273e-07, + "logits/chosen": 0.11935015022754669, + "logits/rejected": 0.2006658911705017, + "logps/accuracies": 0.875, + "logps/chosen": -336.0437927246094, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -329.3843078613281, + "logps/ref_rejected": -327.40045166015625, + "logps/rejected": -413.8304748535156, + "loss": 0.6046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6659499406814575, + "rewards/grad_term": 0.00346172577701509, + "rewards/margins": 7.977048873901367, + "rewards/rejected": -8.642998695373535, + "step": 197 + }, + { + "epoch": 0.41073512252042005, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 40.358779912668254, + "learning_rate": 8.846597462514417e-07, + "logits/chosen": 0.23720747232437134, + "logits/rejected": 0.2754895091056824, + "logps/accuracies": 0.875, + "logps/chosen": -296.7972106933594, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -265.8367614746094, + "logps/ref_rejected": -281.0025939941406, + "logps/rejected": -351.4774475097656, + "loss": 0.625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0960447788238525, + "rewards/grad_term": 0.024012045934796333, + "rewards/margins": 3.9514381885528564, + "rewards/rejected": -7.047482967376709, + "step": 198 + }, + { + "epoch": 0.41280954233112926, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 92.31220771810881, + "learning_rate": 8.835063437139562e-07, + "logits/chosen": 0.4791252911090851, + "logits/rejected": 0.5575248599052429, + "logps/accuracies": 0.8125, + "logps/chosen": -268.31036376953125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -276.9249572753906, + "logps/ref_rejected": -266.7868957519531, + "logps/rejected": -348.34381103515625, + "loss": 0.5736, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.861458420753479, + "rewards/grad_term": 0.004701174795627594, + "rewards/margins": 9.0171537399292, + "rewards/rejected": -8.155694961547852, + "step": 199 + }, + { + "epoch": 0.4148839621418385, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 45.77620831393305, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": 0.12551181018352509, + "logits/rejected": 0.14135059714317322, + "logps/accuracies": 0.9375, + "logps/chosen": -315.8223571777344, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -310.73651123046875, + "logps/ref_rejected": -292.3979797363281, + "logps/rejected": -374.2222900390625, + "loss": 0.6131, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5085856318473816, + "rewards/grad_term": 0.004745251964777708, + "rewards/margins": 7.673846244812012, + "rewards/rejected": -8.182432174682617, + "step": 200 + }, + { + "epoch": 0.41695838195254764, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 40.635916417828575, + "learning_rate": 8.81199538638985e-07, + "logits/chosen": 0.06241011992096901, + "logits/rejected": 0.11283601820468903, + "logps/accuracies": 0.875, + "logps/chosen": -293.8592834472656, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -296.7002258300781, + "logps/ref_rejected": -303.21331787109375, + "logps/rejected": -367.14044189453125, + "loss": 0.5767, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.28409475088119507, + "rewards/grad_term": 0.00995566789060831, + "rewards/margins": 6.67680549621582, + "rewards/rejected": -6.392710208892822, + "step": 201 + }, + { + "epoch": 0.41903280176325686, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 31.417005116540565, + "learning_rate": 8.800461361014993e-07, + "logits/chosen": 0.08436602354049683, + "logits/rejected": 0.06602154672145844, + "logps/accuracies": 0.625, + "logps/chosen": -324.9912109375, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -327.7398681640625, + "logps/ref_rejected": -292.74981689453125, + "logps/rejected": -361.5688171386719, + "loss": 0.5621, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2748683989048004, + "rewards/grad_term": 0.007309483364224434, + "rewards/margins": 7.1567702293396, + "rewards/rejected": -6.88190221786499, + "step": 202 + }, + { + "epoch": 0.421107221573966, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 35.05900906163333, + "learning_rate": 8.788927335640138e-07, + "logits/chosen": 0.20211604237556458, + "logits/rejected": 0.21109752357006073, + "logps/accuracies": 0.8125, + "logps/chosen": -333.8841552734375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -335.36810302734375, + "logps/ref_rejected": -322.5255126953125, + "logps/rejected": -367.34234619140625, + "loss": 0.5631, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.14839425683021545, + "rewards/grad_term": 0.019571855664253235, + "rewards/margins": 4.630078315734863, + "rewards/rejected": -4.481683731079102, + "step": 203 + }, + { + "epoch": 0.42318164138467523, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 32.259107415010234, + "learning_rate": 8.777393310265282e-07, + "logits/chosen": 0.27740195393562317, + "logits/rejected": 0.37459149956703186, + "logps/accuracies": 0.8125, + "logps/chosen": -259.053466796875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -246.73666381835938, + "logps/ref_rejected": -271.6988525390625, + "logps/rejected": -328.5376281738281, + "loss": 0.6065, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2316789627075195, + "rewards/grad_term": 0.0200329702347517, + "rewards/margins": 4.452197551727295, + "rewards/rejected": -5.6838765144348145, + "step": 204 + }, + { + "epoch": 0.4252560611953844, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 32.608539229670114, + "learning_rate": 8.765859284890427e-07, + "logits/chosen": 0.16907714307308197, + "logits/rejected": 0.20513200759887695, + "logps/accuracies": 0.875, + "logps/chosen": -244.8258056640625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -226.30160522460938, + "logps/ref_rejected": -258.55389404296875, + "logps/rejected": -322.78662109375, + "loss": 0.5794, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8524205684661865, + "rewards/grad_term": 0.019479090347886086, + "rewards/margins": 4.570858001708984, + "rewards/rejected": -6.42327880859375, + "step": 205 + }, + { + "epoch": 0.4273304810060936, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 42.93969277035671, + "learning_rate": 8.754325259515571e-07, + "logits/chosen": 0.19578887522220612, + "logits/rejected": 0.24128146469593048, + "logps/accuracies": 0.8125, + "logps/chosen": -270.4194030761719, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -261.4314270019531, + "logps/ref_rejected": -278.7731628417969, + "logps/rejected": -349.106201171875, + "loss": 0.5817, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8987985849380493, + "rewards/grad_term": 0.01616433635354042, + "rewards/margins": 6.134509086608887, + "rewards/rejected": -7.0333075523376465, + "step": 206 + }, + { + "epoch": 0.4294049008168028, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 40.26953156215366, + "learning_rate": 8.742791234140715e-07, + "logits/chosen": 0.2927509844303131, + "logits/rejected": 0.4132809340953827, + "logps/accuracies": 0.75, + "logps/chosen": -298.0523986816406, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -286.35516357421875, + "logps/ref_rejected": -299.4516296386719, + "logps/rejected": -371.3787841796875, + "loss": 0.5984, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.169724702835083, + "rewards/grad_term": 0.015548234805464745, + "rewards/margins": 6.022989749908447, + "rewards/rejected": -7.192714214324951, + "step": 207 + }, + { + "epoch": 0.431479320627512, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 44.42958011244542, + "learning_rate": 8.731257208765859e-07, + "logits/chosen": 0.15052379667758942, + "logits/rejected": 0.13466718792915344, + "logps/accuracies": 0.75, + "logps/chosen": -344.1072692871094, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -348.62744140625, + "logps/ref_rejected": -322.577880859375, + "logps/rejected": -371.2545471191406, + "loss": 0.5599, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.45202189683914185, + "rewards/grad_term": 0.014142685569822788, + "rewards/margins": 5.319693565368652, + "rewards/rejected": -4.867671966552734, + "step": 208 + }, + { + "epoch": 0.4335537404382212, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 55.08631188728783, + "learning_rate": 8.719723183391004e-07, + "logits/chosen": 0.2708834409713745, + "logits/rejected": 0.3263266980648041, + "logps/accuracies": 0.875, + "logps/chosen": -266.4635009765625, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -266.25146484375, + "logps/ref_rejected": -271.44537353515625, + "logps/rejected": -334.1143798828125, + "loss": 0.5873, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.021202266216278076, + "rewards/grad_term": 0.010078574530780315, + "rewards/margins": 6.245699882507324, + "rewards/rejected": -6.266901969909668, + "step": 209 + }, + { + "epoch": 0.43562816024893036, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 42.60799619320707, + "learning_rate": 8.708189158016147e-07, + "logits/chosen": 0.3283449113368988, + "logits/rejected": 0.3115319013595581, + "logps/accuracies": 0.875, + "logps/chosen": -309.02783203125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -306.033935546875, + "logps/ref_rejected": -304.4499816894531, + "logps/rejected": -374.966064453125, + "loss": 0.5595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2993917167186737, + "rewards/grad_term": 0.011198869906365871, + "rewards/margins": 6.752218246459961, + "rewards/rejected": -7.051610469818115, + "step": 210 + }, + { + "epoch": 0.4377025800596396, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 74.62504339808369, + "learning_rate": 8.696655132641292e-07, + "logits/chosen": 0.04295940697193146, + "logits/rejected": 0.1364721655845642, + "logps/accuracies": 0.75, + "logps/chosen": -253.16937255859375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -249.6319580078125, + "logps/ref_rejected": -297.3905029296875, + "logps/rejected": -369.6263427734375, + "loss": 0.5717, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3537403643131256, + "rewards/grad_term": 0.011611053720116615, + "rewards/margins": 6.869847297668457, + "rewards/rejected": -7.223587989807129, + "step": 211 + }, + { + "epoch": 0.43977699987034874, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.125, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 42.048425807585254, + "learning_rate": 8.685121107266435e-07, + "logits/chosen": 0.2111242413520813, + "logits/rejected": 0.26101890206336975, + "logps/accuracies": 0.625, + "logps/chosen": -333.13372802734375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -298.3570556640625, + "logps/ref_rejected": -285.5286865234375, + "logps/rejected": -340.77008056640625, + "loss": 0.59, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4776673316955566, + "rewards/grad_term": 0.029041055589914322, + "rewards/margins": 2.046469211578369, + "rewards/rejected": -5.524137020111084, + "step": 212 + }, + { + "epoch": 0.44185141968105796, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 30.70565260692096, + "learning_rate": 8.67358708189158e-07, + "logits/chosen": 0.24488027393817902, + "logits/rejected": 0.3360682725906372, + "logps/accuracies": 0.75, + "logps/chosen": -306.7230529785156, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -302.4961242675781, + "logps/ref_rejected": -303.74755859375, + "logps/rejected": -375.0382080078125, + "loss": 0.6463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4226952791213989, + "rewards/grad_term": 0.006055990234017372, + "rewards/margins": 6.706371784210205, + "rewards/rejected": -7.1290669441223145, + "step": 213 + }, + { + "epoch": 0.44392583949176717, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 58.908823582347665, + "learning_rate": 8.662053056516724e-07, + "logits/chosen": 0.2656205892562866, + "logits/rejected": 0.29711484909057617, + "logps/accuracies": 0.8125, + "logps/chosen": -269.3460693359375, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -255.48500061035156, + "logps/ref_rejected": -259.42156982421875, + "logps/rejected": -320.4120178222656, + "loss": 0.5785, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3861079216003418, + "rewards/grad_term": 0.017718670889735222, + "rewards/margins": 4.71293830871582, + "rewards/rejected": -6.099046230316162, + "step": 214 + }, + { + "epoch": 0.44600025930247633, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 56.61591688580251, + "learning_rate": 8.650519031141868e-07, + "logits/chosen": 0.35960566997528076, + "logits/rejected": 0.3392384648323059, + "logps/accuracies": 0.8125, + "logps/chosen": -299.29644775390625, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -293.4299011230469, + "logps/ref_rejected": -299.168701171875, + "logps/rejected": -368.6275939941406, + "loss": 0.5517, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5866526961326599, + "rewards/grad_term": 0.011029000394046307, + "rewards/margins": 6.359241008758545, + "rewards/rejected": -6.94589376449585, + "step": 215 + }, + { + "epoch": 0.44807467911318555, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 35.14556178232318, + "learning_rate": 8.638985005767012e-07, + "logits/chosen": 0.11206863820552826, + "logits/rejected": 0.19429105520248413, + "logps/accuracies": 0.875, + "logps/chosen": -280.0439453125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -273.8160705566406, + "logps/ref_rejected": -276.4722595214844, + "logps/rejected": -337.29913330078125, + "loss": 0.6022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6227847933769226, + "rewards/grad_term": 0.008503232151269913, + "rewards/margins": 5.459905624389648, + "rewards/rejected": -6.082690715789795, + "step": 216 + }, + { + "epoch": 0.4501490989238947, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 36.20806400718805, + "learning_rate": 8.627450980392156e-07, + "logits/chosen": -0.1600431501865387, + "logits/rejected": -0.10942815244197845, + "logps/accuracies": 0.875, + "logps/chosen": -301.800048828125, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -299.7960205078125, + "logps/ref_rejected": -275.73529052734375, + "logps/rejected": -364.4363098144531, + "loss": 0.5609, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20040588080883026, + "rewards/grad_term": 0.004247845150530338, + "rewards/margins": 8.669699668884277, + "rewards/rejected": -8.870105743408203, + "step": 217 + }, + { + "epoch": 0.4522235187346039, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.125, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 45.446157769285975, + "learning_rate": 8.615916955017301e-07, + "logits/chosen": 0.24637356400489807, + "logits/rejected": 0.284047931432724, + "logps/accuracies": 0.625, + "logps/chosen": -249.30892944335938, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -239.27142333984375, + "logps/ref_rejected": -260.403076171875, + "logps/rejected": -308.0331726074219, + "loss": 0.6314, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0037511587142944, + "rewards/grad_term": 0.02335098199546337, + "rewards/margins": 3.759258508682251, + "rewards/rejected": -4.763010025024414, + "step": 218 + }, + { + "epoch": 0.4542979385453131, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 30.72090456802453, + "learning_rate": 8.604382929642446e-07, + "logits/chosen": 0.30798035860061646, + "logits/rejected": 0.3721332848072052, + "logps/accuracies": 0.875, + "logps/chosen": -270.7388000488281, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -256.83319091796875, + "logps/ref_rejected": -253.60997009277344, + "logps/rejected": -320.3992004394531, + "loss": 0.5356, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3905625343322754, + "rewards/grad_term": 0.02029426395893097, + "rewards/margins": 5.288358688354492, + "rewards/rejected": -6.678920745849609, + "step": 219 + }, + { + "epoch": 0.4563723583560223, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 51.870714437465466, + "learning_rate": 8.592848904267589e-07, + "logits/chosen": 0.03684063255786896, + "logits/rejected": 0.16017459332942963, + "logps/accuracies": 0.75, + "logps/chosen": -253.1938934326172, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -256.5488586425781, + "logps/ref_rejected": -286.39129638671875, + "logps/rejected": -353.81689453125, + "loss": 0.5563, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3354969322681427, + "rewards/grad_term": 0.007589938119053841, + "rewards/margins": 7.078057765960693, + "rewards/rejected": -6.742560386657715, + "step": 220 + }, + { + "epoch": 0.4584467781667315, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.625, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 35.42772905894603, + "learning_rate": 8.581314878892734e-07, + "logits/chosen": 0.2873913049697876, + "logits/rejected": 0.26383453607559204, + "logps/accuracies": 0.875, + "logps/chosen": -360.1090087890625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -353.25018310546875, + "logps/ref_rejected": -333.6820068359375, + "logps/rejected": -418.5933532714844, + "loss": 0.5623, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6858816146850586, + "rewards/grad_term": 0.0032467113342136145, + "rewards/margins": 7.8052544593811035, + "rewards/rejected": -8.491135597229004, + "step": 221 + }, + { + "epoch": 0.4605211979774407, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 84.34900463866556, + "learning_rate": 8.569780853517877e-07, + "logits/chosen": 0.1874726116657257, + "logits/rejected": 0.2251831591129303, + "logps/accuracies": 0.875, + "logps/chosen": -264.17974853515625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -255.59912109375, + "logps/ref_rejected": -261.2677001953125, + "logps/rejected": -335.05792236328125, + "loss": 0.6218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.858064591884613, + "rewards/grad_term": 0.008338917046785355, + "rewards/margins": 6.520959377288818, + "rewards/rejected": -7.379024028778076, + "step": 222 + }, + { + "epoch": 0.4625956177881499, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 27.170194985611396, + "learning_rate": 8.558246828143022e-07, + "logits/chosen": 0.26863163709640503, + "logits/rejected": 0.2670744061470032, + "logps/accuracies": 0.875, + "logps/chosen": -277.2497863769531, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -275.13916015625, + "logps/ref_rejected": -272.11383056640625, + "logps/rejected": -328.0150146484375, + "loss": 0.5319, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.21106398105621338, + "rewards/grad_term": 0.012204117141664028, + "rewards/margins": 5.379053115844727, + "rewards/rejected": -5.590117931365967, + "step": 223 + }, + { + "epoch": 0.46467003759885905, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 79.27462179097967, + "learning_rate": 8.546712802768166e-07, + "logits/chosen": 0.18783439695835114, + "logits/rejected": 0.19196242094039917, + "logps/accuracies": 0.875, + "logps/chosen": -347.10906982421875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -339.8036193847656, + "logps/ref_rejected": -325.4422912597656, + "logps/rejected": -407.8504333496094, + "loss": 0.6027, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7305458784103394, + "rewards/grad_term": 0.007087053265422583, + "rewards/margins": 7.510266304016113, + "rewards/rejected": -8.240811347961426, + "step": 224 + }, + { + "epoch": 0.46674445740956827, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.0625, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 35.29759314617677, + "learning_rate": 8.53517877739331e-07, + "logits/chosen": -0.19355978071689606, + "logits/rejected": -0.016655761748552322, + "logps/accuracies": 0.6875, + "logps/chosen": -308.5700378417969, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -295.40545654296875, + "logps/ref_rejected": -343.7320251464844, + "logps/rejected": -400.0100402832031, + "loss": 0.5589, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3164560794830322, + "rewards/grad_term": 0.02172619104385376, + "rewards/margins": 4.311344146728516, + "rewards/rejected": -5.627799987792969, + "step": 225 + }, + { + "epoch": 0.46881887722027743, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 61.84351353540357, + "learning_rate": 8.523644752018454e-07, + "logits/chosen": 0.20059531927108765, + "logits/rejected": 0.1843167543411255, + "logps/accuracies": 0.75, + "logps/chosen": -261.94525146484375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -262.80419921875, + "logps/ref_rejected": -244.80892944335938, + "logps/rejected": -298.7880554199219, + "loss": 0.5996, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08589661121368408, + "rewards/grad_term": 0.018526069819927216, + "rewards/margins": 5.4838104248046875, + "rewards/rejected": -5.397914409637451, + "step": 226 + }, + { + "epoch": 0.47089329703098665, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 26.516761004669306, + "learning_rate": 8.512110726643598e-07, + "logits/chosen": 0.08376497030258179, + "logits/rejected": 0.1468810886144638, + "logps/accuracies": 0.9375, + "logps/chosen": -255.428466796875, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -253.26193237304688, + "logps/ref_rejected": -286.236572265625, + "logps/rejected": -325.91131591796875, + "loss": 0.6258, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.21665439009666443, + "rewards/grad_term": 0.02595067396759987, + "rewards/margins": 3.75081729888916, + "rewards/rejected": -3.9674713611602783, + "step": 227 + }, + { + "epoch": 0.47296771684169586, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 33.01960140111705, + "learning_rate": 8.500576701268742e-07, + "logits/chosen": 0.20910394191741943, + "logits/rejected": 0.21387630701065063, + "logps/accuracies": 0.8125, + "logps/chosen": -304.03131103515625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -293.5747985839844, + "logps/ref_rejected": -301.56024169921875, + "logps/rejected": -366.596435546875, + "loss": 0.5344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0456496477127075, + "rewards/grad_term": 0.01662050373852253, + "rewards/margins": 5.4579668045043945, + "rewards/rejected": -6.5036163330078125, + "step": 228 + }, + { + "epoch": 0.475042136652405, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 29.434714310320842, + "learning_rate": 8.489042675893887e-07, + "logits/chosen": 0.15450771152973175, + "logits/rejected": 0.19930016994476318, + "logps/accuracies": 0.8125, + "logps/chosen": -294.2052917480469, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -294.3157653808594, + "logps/ref_rejected": -289.5907897949219, + "logps/rejected": -365.57635498046875, + "loss": 0.5452, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.011044904589653015, + "rewards/grad_term": 0.0027202588971704245, + "rewards/margins": 7.609601974487305, + "rewards/rejected": -7.598557949066162, + "step": 229 + }, + { + "epoch": 0.47711655646311424, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 38.213053143905036, + "learning_rate": 8.477508650519031e-07, + "logits/chosen": 0.2588901221752167, + "logits/rejected": 0.44516509771347046, + "logps/accuracies": 1.0, + "logps/chosen": -291.5958251953125, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -285.3277282714844, + "logps/ref_rejected": -350.38787841796875, + "logps/rejected": -426.9537353515625, + "loss": 0.5646, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6268075704574585, + "rewards/grad_term": 0.007204078137874603, + "rewards/margins": 7.029778480529785, + "rewards/rejected": -7.656586170196533, + "step": 230 + }, + { + "epoch": 0.4791909762738234, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 25.865551411840645, + "learning_rate": 8.465974625144176e-07, + "logits/chosen": 0.2644757926464081, + "logits/rejected": 0.29192623496055603, + "logps/accuracies": 0.875, + "logps/chosen": -313.8361511230469, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -314.16973876953125, + "logps/ref_rejected": -297.2124938964844, + "logps/rejected": -363.6337890625, + "loss": 0.5635, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03335915505886078, + "rewards/grad_term": 0.01406506821513176, + "rewards/margins": 6.675488471984863, + "rewards/rejected": -6.642129421234131, + "step": 231 + }, + { + "epoch": 0.4812653960845326, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 20.4318067473308, + "learning_rate": 8.454440599769319e-07, + "logits/chosen": 0.18658952414989471, + "logits/rejected": 0.25278547406196594, + "logps/accuracies": 0.875, + "logps/chosen": -266.6859436035156, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -272.0683288574219, + "logps/ref_rejected": -291.2773132324219, + "logps/rejected": -373.94342041015625, + "loss": 0.5679, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5382405519485474, + "rewards/grad_term": 0.002569821197539568, + "rewards/margins": 8.804851531982422, + "rewards/rejected": -8.266611099243164, + "step": 232 + }, + { + "epoch": 0.4833398158952418, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 26.814576411742852, + "learning_rate": 8.442906574394463e-07, + "logits/chosen": 0.22701847553253174, + "logits/rejected": 0.5475070476531982, + "logps/accuracies": 0.875, + "logps/chosen": -319.60491943359375, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -312.2276611328125, + "logps/ref_rejected": -340.19671630859375, + "logps/rejected": -403.5230712890625, + "loss": 0.5561, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7377276420593262, + "rewards/grad_term": 0.011820271611213684, + "rewards/margins": 5.594909191131592, + "rewards/rejected": -6.33263635635376, + "step": 233 + }, + { + "epoch": 0.485414235705951, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 42.80927565779465, + "learning_rate": 8.431372549019608e-07, + "logits/chosen": 0.03597773611545563, + "logits/rejected": 0.07471846044063568, + "logps/accuracies": 0.9375, + "logps/chosen": -313.0096435546875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -303.0143737792969, + "logps/ref_rejected": -296.0675354003906, + "logps/rejected": -380.6692199707031, + "loss": 0.5752, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9995289444923401, + "rewards/grad_term": 0.003610477549955249, + "rewards/margins": 7.460636615753174, + "rewards/rejected": -8.460165977478027, + "step": 234 + }, + { + "epoch": 0.4874886555166602, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 36.409560354500066, + "learning_rate": 8.419838523644751e-07, + "logits/chosen": 0.33159953355789185, + "logits/rejected": 0.3636232912540436, + "logps/accuracies": 0.75, + "logps/chosen": -399.6930847167969, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -377.0238037109375, + "logps/ref_rejected": -366.9959411621094, + "logps/rejected": -440.5434875488281, + "loss": 0.6419, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2669270038604736, + "rewards/grad_term": 0.01446828804910183, + "rewards/margins": 5.087828159332275, + "rewards/rejected": -7.354754447937012, + "step": 235 + }, + { + "epoch": 0.48956307532736937, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 33.58443547525811, + "learning_rate": 8.408304498269896e-07, + "logits/chosen": 0.41071584820747375, + "logits/rejected": 0.46553879976272583, + "logps/accuracies": 0.875, + "logps/chosen": -272.25848388671875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -277.6263427734375, + "logps/ref_rejected": -272.1466369628906, + "logps/rejected": -331.35101318359375, + "loss": 0.5812, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5367855429649353, + "rewards/grad_term": 0.007947854697704315, + "rewards/margins": 6.457221031188965, + "rewards/rejected": -5.920435905456543, + "step": 236 + }, + { + "epoch": 0.4916374951380786, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 32.073065465479466, + "learning_rate": 8.396770472895039e-07, + "logits/chosen": 0.35054367780685425, + "logits/rejected": 0.38058048486709595, + "logps/accuracies": 0.875, + "logps/chosen": -266.0827331542969, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -259.3359375, + "logps/ref_rejected": -264.76947021484375, + "logps/rejected": -329.091552734375, + "loss": 0.5934, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6746789216995239, + "rewards/grad_term": 0.01457288395613432, + "rewards/margins": 5.75752592086792, + "rewards/rejected": -6.432204246520996, + "step": 237 + }, + { + "epoch": 0.49371191494878774, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 45.30830579519183, + "learning_rate": 8.385236447520184e-07, + "logits/chosen": 0.4467751979827881, + "logits/rejected": 0.4529315233230591, + "logps/accuracies": 0.8125, + "logps/chosen": -294.876953125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -297.973876953125, + "logps/ref_rejected": -294.9307861328125, + "logps/rejected": -360.9711608886719, + "loss": 0.559, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30968841910362244, + "rewards/grad_term": 0.012534530833363533, + "rewards/margins": 6.913724899291992, + "rewards/rejected": -6.604036808013916, + "step": 238 + }, + { + "epoch": 0.49578633475949696, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 42.12849078368493, + "learning_rate": 8.373702422145328e-07, + "logits/chosen": 0.23043927550315857, + "logits/rejected": 0.4118332862854004, + "logps/accuracies": 0.75, + "logps/chosen": -323.7980041503906, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -326.3075866699219, + "logps/ref_rejected": -390.9226379394531, + "logps/rejected": -446.6671142578125, + "loss": 0.5826, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2509579062461853, + "rewards/grad_term": 0.013378635980188847, + "rewards/margins": 5.825405597686768, + "rewards/rejected": -5.5744476318359375, + "step": 239 + }, + { + "epoch": 0.4978607545702061, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 31.909536036533762, + "learning_rate": 8.362168396770472e-07, + "logits/chosen": 0.16586509346961975, + "logits/rejected": 0.25248032808303833, + "logps/accuracies": 0.875, + "logps/chosen": -282.19866943359375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -285.1270751953125, + "logps/ref_rejected": -276.7912902832031, + "logps/rejected": -345.7266845703125, + "loss": 0.5355, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29284125566482544, + "rewards/grad_term": 0.007976886816322803, + "rewards/margins": 7.186383247375488, + "rewards/rejected": -6.89354133605957, + "step": 240 + }, + { + "epoch": 0.49993517438091534, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 30.525809924473112, + "learning_rate": 8.350634371395616e-07, + "logits/chosen": 0.3237009048461914, + "logits/rejected": 0.40990960597991943, + "logps/accuracies": 0.875, + "logps/chosen": -287.2841796875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -285.5251159667969, + "logps/ref_rejected": -302.3129577636719, + "logps/rejected": -370.07891845703125, + "loss": 0.5961, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1759052574634552, + "rewards/grad_term": 0.010726590640842915, + "rewards/margins": 6.6006951332092285, + "rewards/rejected": -6.776600360870361, + "step": 241 + }, + { + "epoch": 0.5020095941916245, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 31.349241798619, + "learning_rate": 8.33910034602076e-07, + "logits/chosen": 0.05717964842915535, + "logits/rejected": 0.0796816349029541, + "logps/accuracies": 0.8125, + "logps/chosen": -295.54229736328125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -297.224365234375, + "logps/ref_rejected": -284.146728515625, + "logps/rejected": -344.3078918457031, + "loss": 0.578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16821032762527466, + "rewards/grad_term": 0.014522448182106018, + "rewards/margins": 6.184324264526367, + "rewards/rejected": -6.016113758087158, + "step": 242 + }, + { + "epoch": 0.5040840140023337, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 33.030305222542474, + "learning_rate": 8.327566320645905e-07, + "logits/chosen": 0.06713651120662689, + "logits/rejected": 0.08016189187765121, + "logps/accuracies": 0.9375, + "logps/chosen": -268.2383117675781, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -267.01324462890625, + "logps/ref_rejected": -280.7307434082031, + "logps/rejected": -350.9720458984375, + "loss": 0.557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12250781059265137, + "rewards/grad_term": 0.006276478059589863, + "rewards/margins": 6.901622772216797, + "rewards/rejected": -7.024130344390869, + "step": 243 + }, + { + "epoch": 0.5061584338130429, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 27.004780238727747, + "learning_rate": 8.31603229527105e-07, + "logits/chosen": 0.04987862706184387, + "logits/rejected": 0.018930042162537575, + "logps/accuracies": 0.75, + "logps/chosen": -304.90655517578125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -304.6758728027344, + "logps/ref_rejected": -300.8800048828125, + "logps/rejected": -368.5686950683594, + "loss": 0.5892, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.023071274161338806, + "rewards/grad_term": 0.01271775085479021, + "rewards/margins": 6.745797157287598, + "rewards/rejected": -6.768868923187256, + "step": 244 + }, + { + "epoch": 0.5082328536237521, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 39.398222375890796, + "learning_rate": 8.304498269896193e-07, + "logits/chosen": 0.15408623218536377, + "logits/rejected": 0.16406217217445374, + "logps/accuracies": 0.8125, + "logps/chosen": -318.6749572753906, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -316.9960632324219, + "logps/ref_rejected": -343.3831787109375, + "logps/rejected": -391.7142333984375, + "loss": 0.5758, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16788998246192932, + "rewards/grad_term": 0.01882031187415123, + "rewards/margins": 4.6652140617370605, + "rewards/rejected": -4.833104133605957, + "step": 245 + }, + { + "epoch": 0.5103072734344613, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 57.065723685654525, + "learning_rate": 8.292964244521338e-07, + "logits/chosen": 0.21645879745483398, + "logits/rejected": 0.2783927619457245, + "logps/accuracies": 0.75, + "logps/chosen": -304.6559143066406, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -307.0755615234375, + "logps/ref_rejected": -296.75225830078125, + "logps/rejected": -353.28765869140625, + "loss": 0.5619, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.241965651512146, + "rewards/grad_term": 0.016664672642946243, + "rewards/margins": 5.895508289337158, + "rewards/rejected": -5.6535420417785645, + "step": 246 + }, + { + "epoch": 0.5123816932451705, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 32.148990652138416, + "learning_rate": 8.281430219146481e-07, + "logits/chosen": 0.19208469986915588, + "logits/rejected": 0.09641852974891663, + "logps/accuracies": 0.6875, + "logps/chosen": -406.9360046386719, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -399.0845947265625, + "logps/ref_rejected": -377.1170349121094, + "logps/rejected": -460.0271911621094, + "loss": 0.5832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7851426005363464, + "rewards/grad_term": 0.0035166891757398844, + "rewards/margins": 7.505876064300537, + "rewards/rejected": -8.29101848602295, + "step": 247 + }, + { + "epoch": 0.5144561130558797, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 44.560082669866084, + "learning_rate": 8.269896193771626e-07, + "logits/chosen": 0.4188195765018463, + "logits/rejected": 0.4568687975406647, + "logps/accuracies": 0.8125, + "logps/chosen": -319.69805908203125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -304.82037353515625, + "logps/ref_rejected": -328.6085205078125, + "logps/rejected": -397.77398681640625, + "loss": 0.5775, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4877678155899048, + "rewards/grad_term": 0.007945088669657707, + "rewards/margins": 5.428779602050781, + "rewards/rejected": -6.916546821594238, + "step": 248 + }, + { + "epoch": 0.5165305328665889, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 49.37389001042154, + "learning_rate": 8.25836216839677e-07, + "logits/chosen": 0.0985247939825058, + "logits/rejected": 0.13652461767196655, + "logps/accuracies": 0.875, + "logps/chosen": -316.8545837402344, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -325.44244384765625, + "logps/ref_rejected": -335.2078857421875, + "logps/rejected": -413.33251953125, + "loss": 0.5037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8587861657142639, + "rewards/grad_term": 0.004567756317555904, + "rewards/margins": 8.671252250671387, + "rewards/rejected": -7.812466144561768, + "step": 249 + }, + { + "epoch": 0.5186049526772981, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 55.51867340899016, + "learning_rate": 8.246828143021914e-07, + "logits/chosen": 0.28938692808151245, + "logits/rejected": 0.27797943353652954, + "logps/accuracies": 1.0, + "logps/chosen": -322.72845458984375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -325.44219970703125, + "logps/ref_rejected": -326.6589660644531, + "logps/rejected": -402.5311279296875, + "loss": 0.5775, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.27137255668640137, + "rewards/grad_term": 0.01095657143741846, + "rewards/margins": 7.858592510223389, + "rewards/rejected": -7.587219715118408, + "step": 250 + }, + { + "epoch": 0.5206793724880072, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 56.0035451408766, + "learning_rate": 8.235294117647058e-07, + "logits/chosen": 0.2637563943862915, + "logits/rejected": 0.33145201206207275, + "logps/accuracies": 0.8125, + "logps/chosen": -395.2080078125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -379.0206604003906, + "logps/ref_rejected": -411.3179931640625, + "logps/rejected": -493.62921142578125, + "loss": 0.5508, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6187384128570557, + "rewards/grad_term": 0.009982087649405003, + "rewards/margins": 6.612382411956787, + "rewards/rejected": -8.231120109558105, + "step": 251 + }, + { + "epoch": 0.5227537922987164, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 49.66999137567622, + "learning_rate": 8.223760092272203e-07, + "logits/chosen": 0.15164095163345337, + "logits/rejected": 0.20300878584384918, + "logps/accuracies": 0.75, + "logps/chosen": -314.79412841796875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -317.7497253417969, + "logps/ref_rejected": -307.1460876464844, + "logps/rejected": -389.70745849609375, + "loss": 0.5436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2955569326877594, + "rewards/grad_term": 0.00618112925440073, + "rewards/margins": 8.551695823669434, + "rewards/rejected": -8.256139755249023, + "step": 252 + }, + { + "epoch": 0.5248282121094257, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 39.0728876643306, + "learning_rate": 8.212226066897346e-07, + "logits/chosen": 0.4079715609550476, + "logits/rejected": 0.5913187861442566, + "logps/accuracies": 0.875, + "logps/chosen": -299.94525146484375, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -300.0797119140625, + "logps/ref_rejected": -358.2881164550781, + "logps/rejected": -424.64324951171875, + "loss": 0.494, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01344829797744751, + "rewards/grad_term": 0.008727732114493847, + "rewards/margins": 6.648958683013916, + "rewards/rejected": -6.635509967803955, + "step": 253 + }, + { + "epoch": 0.5269026319201349, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 31.821418285963002, + "learning_rate": 8.200692041522491e-07, + "logits/chosen": 0.4014374613761902, + "logits/rejected": 0.4343331456184387, + "logps/accuracies": 0.9375, + "logps/chosen": -231.1729278564453, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -230.2028045654297, + "logps/ref_rejected": -237.1446075439453, + "logps/rejected": -303.7888488769531, + "loss": 0.5901, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09701316803693771, + "rewards/grad_term": 0.006798036862164736, + "rewards/margins": 6.56741189956665, + "rewards/rejected": -6.664424896240234, + "step": 254 + }, + { + "epoch": 0.5289770517308441, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 49.53309810118567, + "learning_rate": 8.189158016147634e-07, + "logits/chosen": 0.10699253529310226, + "logits/rejected": 0.11342119425535202, + "logps/accuracies": 0.75, + "logps/chosen": -301.25408935546875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -310.416748046875, + "logps/ref_rejected": -280.7598876953125, + "logps/rejected": -345.6063537597656, + "loss": 0.6471, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9162629842758179, + "rewards/grad_term": 0.009691519662737846, + "rewards/margins": 7.400913715362549, + "rewards/rejected": -6.484650611877441, + "step": 255 + }, + { + "epoch": 0.5310514715415532, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 33.11473943842344, + "learning_rate": 8.17762399077278e-07, + "logits/chosen": 0.09529374539852142, + "logits/rejected": 0.2966251075267792, + "logps/accuracies": 0.6875, + "logps/chosen": -278.4207763671875, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -268.88885498046875, + "logps/ref_rejected": -287.082275390625, + "logps/rejected": -344.5219421386719, + "loss": 0.5951, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9531930088996887, + "rewards/grad_term": 0.021204093471169472, + "rewards/margins": 4.790775775909424, + "rewards/rejected": -5.743968963623047, + "step": 256 + }, + { + "epoch": 0.5310514715415532, + "eval_flips/correct->correct": 0.4334975481033325, + "eval_flips/correct->incorrect": 0.009852216579020023, + "eval_flips/incorrect->correct": 0.3300492465496063, + "eval_flips/incorrect->incorrect": 0.2266009896993637, + "eval_logits/chosen": 0.20908966660499573, + "eval_logits/rejected": 0.25232627987861633, + "eval_logps/accuracies": 0.7635468244552612, + "eval_logps/chosen": -291.91790771484375, + "eval_logps/ref_accuracies": 0.4433497488498688, + "eval_logps/ref_chosen": -287.3511047363281, + "eval_logps/ref_rejected": -289.0460205078125, + "eval_logps/rejected": -345.9736328125, + "eval_loss": 0.6100751161575317, + "eval_rewards/accuracies": 0.8768472671508789, + "eval_rewards/chosen": -0.4566830098628998, + "eval_rewards/grad_term": 0.015678314492106438, + "eval_rewards/margins": 5.236079216003418, + "eval_rewards/rejected": -5.69276237487793, + "eval_runtime": 803.7781, + "eval_samples_per_second": 2.013, + "eval_steps_per_second": 0.253, + "step": 256 + }, + { + "epoch": 0.5331258913522624, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 37.54918040748534, + "learning_rate": 8.166089965397924e-07, + "logits/chosen": 0.14258748292922974, + "logits/rejected": 0.1967656910419464, + "logps/accuracies": 0.875, + "logps/chosen": -307.8832702636719, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -313.5946350097656, + "logps/ref_rejected": -304.44818115234375, + "logps/rejected": -381.94158935546875, + "loss": 0.5998, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5711380243301392, + "rewards/grad_term": 0.004750548396259546, + "rewards/margins": 8.320480346679688, + "rewards/rejected": -7.749342918395996, + "step": 257 + }, + { + "epoch": 0.5352003111629716, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 66.7330393421814, + "learning_rate": 8.154555940023068e-07, + "logits/chosen": 0.3797518014907837, + "logits/rejected": 0.36165231466293335, + "logps/accuracies": 0.8125, + "logps/chosen": -305.75604248046875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -297.5378112792969, + "logps/ref_rejected": -278.42840576171875, + "logps/rejected": -340.3268127441406, + "loss": 0.5622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8218250274658203, + "rewards/grad_term": 0.01640998013317585, + "rewards/margins": 5.368016242980957, + "rewards/rejected": -6.189841270446777, + "step": 258 + }, + { + "epoch": 0.5372747309736808, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 38.0243266757412, + "learning_rate": 8.143021914648212e-07, + "logits/chosen": 0.31451526284217834, + "logits/rejected": 0.3426423668861389, + "logps/accuracies": 0.75, + "logps/chosen": -214.24871826171875, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -212.14849853515625, + "logps/ref_rejected": -200.25625610351562, + "logps/rejected": -251.4813232421875, + "loss": 0.5821, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.210022434592247, + "rewards/grad_term": 0.01893402822315693, + "rewards/margins": 4.912485122680664, + "rewards/rejected": -5.1225080490112305, + "step": 259 + }, + { + "epoch": 0.5393491507843899, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 27.658939795121892, + "learning_rate": 8.131487889273356e-07, + "logits/chosen": 0.1209304928779602, + "logits/rejected": 0.1580743044614792, + "logps/accuracies": 0.6875, + "logps/chosen": -291.9943542480469, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -296.20867919921875, + "logps/ref_rejected": -299.74664306640625, + "logps/rejected": -365.398193359375, + "loss": 0.5422, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4214297831058502, + "rewards/grad_term": 0.009202235378324986, + "rewards/margins": 6.986582279205322, + "rewards/rejected": -6.565152645111084, + "step": 260 + }, + { + "epoch": 0.5414235705950992, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 75.72204106158672, + "learning_rate": 8.1199538638985e-07, + "logits/chosen": 0.16947351396083832, + "logits/rejected": 0.18022188544273376, + "logps/accuracies": 0.75, + "logps/chosen": -281.2515869140625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -267.09417724609375, + "logps/ref_rejected": -272.4349060058594, + "logps/rejected": -333.783203125, + "loss": 0.606, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4157400131225586, + "rewards/grad_term": 0.01815151423215866, + "rewards/margins": 4.719089984893799, + "rewards/rejected": -6.134829998016357, + "step": 261 + }, + { + "epoch": 0.5434979904058084, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 58.5042523870511, + "learning_rate": 8.108419838523645e-07, + "logits/chosen": 0.18119366466999054, + "logits/rejected": 0.3171493113040924, + "logps/accuracies": 0.875, + "logps/chosen": -298.76116943359375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -292.3468017578125, + "logps/ref_rejected": -334.3692321777344, + "logps/rejected": -419.0423278808594, + "loss": 0.5495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6414406299591064, + "rewards/grad_term": 0.005858146119862795, + "rewards/margins": 7.825870990753174, + "rewards/rejected": -8.467310905456543, + "step": 262 + }, + { + "epoch": 0.5455724102165176, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 16.445083432164537, + "learning_rate": 8.096885813148788e-07, + "logits/chosen": 0.37694644927978516, + "logits/rejected": 0.43579670786857605, + "logps/accuracies": 0.875, + "logps/chosen": -358.7106018066406, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -354.29962158203125, + "logps/ref_rejected": -385.1765441894531, + "logps/rejected": -465.8778381347656, + "loss": 0.5401, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.44109994173049927, + "rewards/grad_term": 0.005547558423131704, + "rewards/margins": 7.6290283203125, + "rewards/rejected": -8.070128440856934, + "step": 263 + }, + { + "epoch": 0.5476468300272268, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 43.5572613048707, + "learning_rate": 8.085351787773933e-07, + "logits/chosen": 0.3072161078453064, + "logits/rejected": 0.2626444697380066, + "logps/accuracies": 0.8125, + "logps/chosen": -259.9176330566406, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -245.10406494140625, + "logps/ref_rejected": -250.30921936035156, + "logps/rejected": -325.20831298828125, + "loss": 0.5928, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4813560247421265, + "rewards/grad_term": 0.01034230925142765, + "rewards/margins": 6.008551597595215, + "rewards/rejected": -7.489907264709473, + "step": 264 + }, + { + "epoch": 0.5497212498379359, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 42.82203006051057, + "learning_rate": 8.073817762399076e-07, + "logits/chosen": 0.13615286350250244, + "logits/rejected": 0.20001475512981415, + "logps/accuracies": 0.75, + "logps/chosen": -338.6938171386719, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -319.03753662109375, + "logps/ref_rejected": -324.310791015625, + "logps/rejected": -421.2921447753906, + "loss": 0.5756, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9656240940093994, + "rewards/grad_term": 0.006605319678783417, + "rewards/margins": 7.732507705688477, + "rewards/rejected": -9.698131561279297, + "step": 265 + }, + { + "epoch": 0.5517956696486451, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 52.44815820061085, + "learning_rate": 8.062283737024221e-07, + "logits/chosen": 0.21208931505680084, + "logits/rejected": 0.25778982043266296, + "logps/accuracies": 0.875, + "logps/chosen": -344.6730651855469, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -331.10345458984375, + "logps/ref_rejected": -351.76409912109375, + "logps/rejected": -448.85931396484375, + "loss": 0.613, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3569657802581787, + "rewards/grad_term": 0.00555332051590085, + "rewards/margins": 8.352553367614746, + "rewards/rejected": -9.70952033996582, + "step": 266 + }, + { + "epoch": 0.5538700894593543, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 41.602018473752594, + "learning_rate": 8.050749711649365e-07, + "logits/chosen": 0.028799353167414665, + "logits/rejected": 0.013060306198894978, + "logps/accuracies": 0.875, + "logps/chosen": -331.1357727050781, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -303.6278076171875, + "logps/ref_rejected": -292.3013610839844, + "logps/rejected": -394.4818115234375, + "loss": 0.6173, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.750793933868408, + "rewards/grad_term": 0.004837782587856054, + "rewards/margins": 7.467255115509033, + "rewards/rejected": -10.218048095703125, + "step": 267 + }, + { + "epoch": 0.5559445092700636, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 113.12904360596185, + "learning_rate": 8.03921568627451e-07, + "logits/chosen": 0.02135728858411312, + "logits/rejected": 0.08749254792928696, + "logps/accuracies": 0.8125, + "logps/chosen": -336.35760498046875, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -308.16162109375, + "logps/ref_rejected": -309.57598876953125, + "logps/rejected": -400.8465576171875, + "loss": 0.6239, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8195974826812744, + "rewards/grad_term": 0.018539071083068848, + "rewards/margins": 6.307459831237793, + "rewards/rejected": -9.127056121826172, + "step": 268 + }, + { + "epoch": 0.5580189290807728, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 33.96227826132554, + "learning_rate": 8.027681660899654e-07, + "logits/chosen": 0.5619252324104309, + "logits/rejected": 0.5743327736854553, + "logps/accuracies": 0.9375, + "logps/chosen": -255.11105346679688, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -252.5300750732422, + "logps/ref_rejected": -270.99432373046875, + "logps/rejected": -342.1889953613281, + "loss": 0.5774, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2580994963645935, + "rewards/grad_term": 0.01020655408501625, + "rewards/margins": 6.861366271972656, + "rewards/rejected": -7.1194658279418945, + "step": 269 + }, + { + "epoch": 0.5600933488914819, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 24.02468529781505, + "learning_rate": 8.016147635524798e-07, + "logits/chosen": 0.27724260091781616, + "logits/rejected": 0.2910709083080292, + "logps/accuracies": 0.8125, + "logps/chosen": -300.6127014160156, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -277.4232482910156, + "logps/ref_rejected": -294.59234619140625, + "logps/rejected": -373.547607421875, + "loss": 0.5739, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3189470767974854, + "rewards/grad_term": 0.014720053412020206, + "rewards/margins": 5.57658052444458, + "rewards/rejected": -7.895526885986328, + "step": 270 + }, + { + "epoch": 0.5621677687021911, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 28.444387072372734, + "learning_rate": 8.004613610149942e-07, + "logits/chosen": 0.05013295263051987, + "logits/rejected": 0.06607392430305481, + "logps/accuracies": 0.9375, + "logps/chosen": -278.4317626953125, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -279.4646301269531, + "logps/ref_rejected": -289.01959228515625, + "logps/rejected": -344.6971130371094, + "loss": 0.5915, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10328565537929535, + "rewards/grad_term": 0.00966467522084713, + "rewards/margins": 5.671037673950195, + "rewards/rejected": -5.567751884460449, + "step": 271 + }, + { + "epoch": 0.5642421885129003, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 51.22016296104716, + "learning_rate": 7.993079584775087e-07, + "logits/chosen": 0.37125492095947266, + "logits/rejected": 0.3817085325717926, + "logps/accuracies": 0.75, + "logps/chosen": -308.3736572265625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -300.4655456542969, + "logps/ref_rejected": -302.07696533203125, + "logps/rejected": -366.42919921875, + "loss": 0.5656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7908135652542114, + "rewards/grad_term": 0.007430646568536758, + "rewards/margins": 5.644411563873291, + "rewards/rejected": -6.435225009918213, + "step": 272 + }, + { + "epoch": 0.5663166083236095, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 32.46457915793264, + "learning_rate": 7.98154555940023e-07, + "logits/chosen": 0.17831876873970032, + "logits/rejected": 0.1457141786813736, + "logps/accuracies": 0.75, + "logps/chosen": -282.64410400390625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -274.8734130859375, + "logps/ref_rejected": -274.3963317871094, + "logps/rejected": -331.91021728515625, + "loss": 0.5866, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7770657539367676, + "rewards/grad_term": 0.016797857359051704, + "rewards/margins": 4.974320411682129, + "rewards/rejected": -5.751385688781738, + "step": 273 + }, + { + "epoch": 0.5683910281343186, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 32.9471484951805, + "learning_rate": 7.970011534025375e-07, + "logits/chosen": 0.5048956871032715, + "logits/rejected": 0.496670126914978, + "logps/accuracies": 0.8125, + "logps/chosen": -301.3432922363281, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -304.7401123046875, + "logps/ref_rejected": -310.8699645996094, + "logps/rejected": -359.89007568359375, + "loss": 0.617, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33968228101730347, + "rewards/grad_term": 0.01532800029963255, + "rewards/margins": 5.24169397354126, + "rewards/rejected": -4.902011871337891, + "step": 274 + }, + { + "epoch": 0.5704654479450278, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 26.58072307751858, + "learning_rate": 7.958477508650518e-07, + "logits/chosen": 0.1677144318819046, + "logits/rejected": 0.2207137495279312, + "logps/accuracies": 0.75, + "logps/chosen": -240.13754272460938, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -248.08505249023438, + "logps/ref_rejected": -233.90797424316406, + "logps/rejected": -271.0197448730469, + "loss": 0.6325, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7947514653205872, + "rewards/grad_term": 0.017878375947475433, + "rewards/margins": 4.505929946899414, + "rewards/rejected": -3.711178779602051, + "step": 275 + }, + { + "epoch": 0.5725398677557371, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 41.27546406513188, + "learning_rate": 7.946943483275663e-07, + "logits/chosen": 0.36588138341903687, + "logits/rejected": 0.4112645983695984, + "logps/accuracies": 0.75, + "logps/chosen": -253.30035400390625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -247.00111389160156, + "logps/ref_rejected": -257.2711486816406, + "logps/rejected": -302.595703125, + "loss": 0.6566, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6299245357513428, + "rewards/grad_term": 0.020320266485214233, + "rewards/margins": 3.9025347232818604, + "rewards/rejected": -4.532459259033203, + "step": 276 + }, + { + "epoch": 0.5746142875664463, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 65.27110995983777, + "learning_rate": 7.935409457900807e-07, + "logits/chosen": -0.01693597435951233, + "logits/rejected": 0.046872012317180634, + "logps/accuracies": 0.625, + "logps/chosen": -275.9945373535156, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -282.8183898925781, + "logps/ref_rejected": -310.315673828125, + "logps/rejected": -344.1127014160156, + "loss": 0.6913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.682384192943573, + "rewards/grad_term": 0.023220881819725037, + "rewards/margins": 4.062088489532471, + "rewards/rejected": -3.379704475402832, + "step": 277 + }, + { + "epoch": 0.5766887073771555, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 51.19472036277012, + "learning_rate": 7.923875432525951e-07, + "logits/chosen": 0.19429253041744232, + "logits/rejected": 0.19109566509723663, + "logps/accuracies": 0.6875, + "logps/chosen": -299.841796875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -306.51116943359375, + "logps/ref_rejected": -324.4086608886719, + "logps/rejected": -347.66070556640625, + "loss": 0.7188, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.666935920715332, + "rewards/grad_term": 0.025834256783127785, + "rewards/margins": 2.9921374320983887, + "rewards/rejected": -2.3252012729644775, + "step": 278 + }, + { + "epoch": 0.5787631271878646, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 111.63212030223526, + "learning_rate": 7.912341407151095e-07, + "logits/chosen": 0.026315703988075256, + "logits/rejected": 0.05185367166996002, + "logps/accuracies": 0.5625, + "logps/chosen": -312.5485534667969, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -322.3186340332031, + "logps/ref_rejected": -310.2381286621094, + "logps/rejected": -346.5958251953125, + "loss": 0.6961, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9770085215568542, + "rewards/grad_term": 0.019868649542331696, + "rewards/margins": 4.612778663635254, + "rewards/rejected": -3.635770082473755, + "step": 279 + }, + { + "epoch": 0.5808375469985738, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 40.5130776185035, + "learning_rate": 7.90080738177624e-07, + "logits/chosen": 0.32091373205184937, + "logits/rejected": 0.4111550450325012, + "logps/accuracies": 0.75, + "logps/chosen": -203.61636352539062, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -205.00054931640625, + "logps/ref_rejected": -283.1324462890625, + "logps/rejected": -327.993896484375, + "loss": 0.6669, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1384190022945404, + "rewards/grad_term": 0.016185998916625977, + "rewards/margins": 4.624567985534668, + "rewards/rejected": -4.486148357391357, + "step": 280 + }, + { + "epoch": 0.582911966809283, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 39.877534822937, + "learning_rate": 7.889273356401384e-07, + "logits/chosen": 0.3415575325489044, + "logits/rejected": 0.360428124666214, + "logps/accuracies": 0.75, + "logps/chosen": -321.03564453125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -328.7980651855469, + "logps/ref_rejected": -312.2154541015625, + "logps/rejected": -357.23773193359375, + "loss": 0.6027, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7762415409088135, + "rewards/grad_term": 0.020651506260037422, + "rewards/margins": 5.278467655181885, + "rewards/rejected": -4.50222635269165, + "step": 281 + }, + { + "epoch": 0.5849863866199922, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 20.370696721525093, + "learning_rate": 7.877739331026529e-07, + "logits/chosen": -0.10976716130971909, + "logits/rejected": 0.04267115890979767, + "logps/accuracies": 0.5625, + "logps/chosen": -316.794189453125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -321.988525390625, + "logps/ref_rejected": -346.69873046875, + "logps/rejected": -393.5243225097656, + "loss": 0.5422, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.519432783126831, + "rewards/grad_term": 0.019725538790225983, + "rewards/margins": 5.201993942260742, + "rewards/rejected": -4.682560920715332, + "step": 282 + }, + { + "epoch": 0.5870608064307015, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 38.36199057696001, + "learning_rate": 7.866205305651672e-07, + "logits/chosen": 0.11121785640716553, + "logits/rejected": 0.21377022564411163, + "logps/accuracies": 0.9375, + "logps/chosen": -260.2328186035156, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -243.456298828125, + "logps/ref_rejected": -277.0938720703125, + "logps/rejected": -323.23675537109375, + "loss": 0.5608, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6776524782180786, + "rewards/grad_term": 0.029680585488677025, + "rewards/margins": 2.9366343021392822, + "rewards/rejected": -4.61428689956665, + "step": 283 + }, + { + "epoch": 0.5891352262414106, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 18.32697126494198, + "learning_rate": 7.854671280276817e-07, + "logits/chosen": 0.09643738716840744, + "logits/rejected": 0.13956782221794128, + "logps/accuracies": 0.6875, + "logps/chosen": -354.23077392578125, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -343.4710388183594, + "logps/ref_rejected": -336.4991760253906, + "logps/rejected": -402.7376403808594, + "loss": 0.6159, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.075973629951477, + "rewards/grad_term": 0.011539540253579617, + "rewards/margins": 5.547872543334961, + "rewards/rejected": -6.623846054077148, + "step": 284 + }, + { + "epoch": 0.5912096460521198, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 52.40142941235436, + "learning_rate": 7.84313725490196e-07, + "logits/chosen": 0.11267786473035812, + "logits/rejected": 0.16389338672161102, + "logps/accuracies": 0.875, + "logps/chosen": -330.61627197265625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -335.897216796875, + "logps/ref_rejected": -348.2872619628906, + "logps/rejected": -430.4730224609375, + "loss": 0.5283, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5280970931053162, + "rewards/grad_term": 0.00416451646015048, + "rewards/margins": 8.746676445007324, + "rewards/rejected": -8.218579292297363, + "step": 285 + }, + { + "epoch": 0.593284065862829, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 47.45081130082864, + "learning_rate": 7.831603229527105e-07, + "logits/chosen": -0.08656018227338791, + "logits/rejected": -0.05061071738600731, + "logps/accuracies": 0.875, + "logps/chosen": -304.7203369140625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -296.37933349609375, + "logps/ref_rejected": -303.2038269042969, + "logps/rejected": -393.2415771484375, + "loss": 0.5669, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8340997695922852, + "rewards/grad_term": 0.011894619092345238, + "rewards/margins": 8.169673919677734, + "rewards/rejected": -9.003772735595703, + "step": 286 + }, + { + "epoch": 0.5953584856735382, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 21.830125532823576, + "learning_rate": 7.820069204152249e-07, + "logits/chosen": 0.16923511028289795, + "logits/rejected": 0.16749337315559387, + "logps/accuracies": 0.8125, + "logps/chosen": -300.83929443359375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -280.837158203125, + "logps/ref_rejected": -279.3500671386719, + "logps/rejected": -371.1993103027344, + "loss": 0.5992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.000213623046875, + "rewards/grad_term": 0.007134607993066311, + "rewards/margins": 7.1847124099731445, + "rewards/rejected": -9.184926986694336, + "step": 287 + }, + { + "epoch": 0.5974329054842473, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 29.122328000376466, + "learning_rate": 7.808535178777393e-07, + "logits/chosen": 0.027427153661847115, + "logits/rejected": 0.04789198189973831, + "logps/accuracies": 0.8125, + "logps/chosen": -390.447509765625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -367.87188720703125, + "logps/ref_rejected": -355.06640625, + "logps/rejected": -450.7095031738281, + "loss": 0.5902, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2575621604919434, + "rewards/grad_term": 0.009585607796907425, + "rewards/margins": 7.306746482849121, + "rewards/rejected": -9.564309120178223, + "step": 288 + }, + { + "epoch": 0.5995073252949565, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 41.33190625329714, + "learning_rate": 7.797001153402537e-07, + "logits/chosen": 0.07080674171447754, + "logits/rejected": 0.10809577256441116, + "logps/accuracies": 0.75, + "logps/chosen": -280.20654296875, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -284.79046630859375, + "logps/ref_rejected": -268.11834716796875, + "logps/rejected": -343.5045166015625, + "loss": 0.5622, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.45839110016822815, + "rewards/grad_term": 0.006786561571061611, + "rewards/margins": 7.997011661529541, + "rewards/rejected": -7.5386199951171875, + "step": 289 + }, + { + "epoch": 0.6015817451056658, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 22.37995830872316, + "learning_rate": 7.785467128027681e-07, + "logits/chosen": 0.046341296285390854, + "logits/rejected": 0.08901657164096832, + "logps/accuracies": 1.0, + "logps/chosen": -315.9812927246094, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -314.4349365234375, + "logps/ref_rejected": -321.4173583984375, + "logps/rejected": -420.6228942871094, + "loss": 0.5426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15463726222515106, + "rewards/grad_term": 0.0006936362478882074, + "rewards/margins": 9.76591682434082, + "rewards/rejected": -9.920555114746094, + "step": 290 + }, + { + "epoch": 0.603656164916375, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 31.611576893068335, + "learning_rate": 7.773933102652825e-07, + "logits/chosen": 0.1491805762052536, + "logits/rejected": 0.1622200310230255, + "logps/accuracies": 0.875, + "logps/chosen": -324.14556884765625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -316.1439208984375, + "logps/ref_rejected": -310.8943176269531, + "logps/rejected": -400.73760986328125, + "loss": 0.5327, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8001646995544434, + "rewards/grad_term": 0.008960644714534283, + "rewards/margins": 8.184164047241211, + "rewards/rejected": -8.984328269958496, + "step": 291 + }, + { + "epoch": 0.6057305847270842, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 40.66113218146233, + "learning_rate": 7.76239907727797e-07, + "logits/chosen": 0.1667100340127945, + "logits/rejected": 0.1031753420829773, + "logps/accuracies": 0.875, + "logps/chosen": -257.600830078125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -251.26223754882812, + "logps/ref_rejected": -256.9619445800781, + "logps/rejected": -332.1715393066406, + "loss": 0.5644, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6338596940040588, + "rewards/grad_term": 0.00917502585798502, + "rewards/margins": 6.887094974517822, + "rewards/rejected": -7.5209550857543945, + "step": 292 + }, + { + "epoch": 0.6078050045377933, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 57.97079310383917, + "learning_rate": 7.750865051903114e-07, + "logits/chosen": -0.08806827664375305, + "logits/rejected": -0.045274168252944946, + "logps/accuracies": 0.75, + "logps/chosen": -308.90509033203125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -310.9958190917969, + "logps/ref_rejected": -308.5406494140625, + "logps/rejected": -372.4405212402344, + "loss": 0.54, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20907306671142578, + "rewards/grad_term": 0.0065223718993365765, + "rewards/margins": 6.599061489105225, + "rewards/rejected": -6.389988899230957, + "step": 293 + }, + { + "epoch": 0.6098794243485025, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 49.04536258498517, + "learning_rate": 7.739331026528259e-07, + "logits/chosen": 0.17086654901504517, + "logits/rejected": 0.19536878168582916, + "logps/accuracies": 0.6875, + "logps/chosen": -318.17401123046875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -322.79205322265625, + "logps/ref_rejected": -298.66278076171875, + "logps/rejected": -354.1503601074219, + "loss": 0.5665, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4618016183376312, + "rewards/grad_term": 0.015101278200745583, + "rewards/margins": 6.010561466217041, + "rewards/rejected": -5.548760414123535, + "step": 294 + }, + { + "epoch": 0.6119538441592117, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 29.110328186799425, + "learning_rate": 7.727797001153403e-07, + "logits/chosen": 0.2920646667480469, + "logits/rejected": 0.3339766263961792, + "logps/accuracies": 0.875, + "logps/chosen": -287.7585144042969, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -291.96600341796875, + "logps/ref_rejected": -315.6741027832031, + "logps/rejected": -371.0801696777344, + "loss": 0.5743, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4207479953765869, + "rewards/grad_term": 0.013844680972397327, + "rewards/margins": 5.96135139465332, + "rewards/rejected": -5.540602684020996, + "step": 295 + }, + { + "epoch": 0.6140282639699209, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 53.230756957534005, + "learning_rate": 7.716262975778547e-07, + "logits/chosen": 0.12378720194101334, + "logits/rejected": 0.16028910875320435, + "logps/accuracies": 0.8125, + "logps/chosen": -291.1141357421875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -294.4344482421875, + "logps/ref_rejected": -296.4821472167969, + "logps/rejected": -359.0552062988281, + "loss": 0.5617, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33203190565109253, + "rewards/grad_term": 0.005090603604912758, + "rewards/margins": 6.58933687210083, + "rewards/rejected": -6.257304668426514, + "step": 296 + }, + { + "epoch": 0.6161026837806302, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 49.27245845537937, + "learning_rate": 7.704728950403691e-07, + "logits/chosen": 0.07910759747028351, + "logits/rejected": 0.08938741683959961, + "logps/accuracies": 0.8125, + "logps/chosen": -336.2572021484375, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -347.0129089355469, + "logps/ref_rejected": -345.33203125, + "logps/rejected": -390.4123229980469, + "loss": 0.5452, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0755696296691895, + "rewards/grad_term": 0.01720615103840828, + "rewards/margins": 5.583600044250488, + "rewards/rejected": -4.508030414581299, + "step": 297 + }, + { + "epoch": 0.6181771035913393, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 52.587110731751046, + "learning_rate": 7.693194925028835e-07, + "logits/chosen": 0.09930308163166046, + "logits/rejected": 0.21927960216999054, + "logps/accuracies": 0.75, + "logps/chosen": -221.76551818847656, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -217.18284606933594, + "logps/ref_rejected": -224.5638427734375, + "logps/rejected": -282.2660217285156, + "loss": 0.5868, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4582689702510834, + "rewards/grad_term": 0.015134723857045174, + "rewards/margins": 5.311949253082275, + "rewards/rejected": -5.7702178955078125, + "step": 298 + }, + { + "epoch": 0.6202515234020485, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 50.13841731685137, + "learning_rate": 7.681660899653979e-07, + "logits/chosen": 0.05825243890285492, + "logits/rejected": 0.1010754331946373, + "logps/accuracies": 0.875, + "logps/chosen": -372.16961669921875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -380.7333679199219, + "logps/ref_rejected": -376.558349609375, + "logps/rejected": -447.0321350097656, + "loss": 0.4912, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8563790321350098, + "rewards/grad_term": 0.006374266929924488, + "rewards/margins": 7.903756141662598, + "rewards/rejected": -7.047377586364746, + "step": 299 + }, + { + "epoch": 0.6223259432127577, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 18.47274167497265, + "learning_rate": 7.670126874279122e-07, + "logits/chosen": 0.01922018826007843, + "logits/rejected": 0.10939830541610718, + "logps/accuracies": 0.6875, + "logps/chosen": -290.3101806640625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -292.36676025390625, + "logps/ref_rejected": -279.2781066894531, + "logps/rejected": -335.0788269042969, + "loss": 0.5304, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20565718412399292, + "rewards/grad_term": 0.012530253268778324, + "rewards/margins": 5.785726547241211, + "rewards/rejected": -5.580069541931152, + "step": 300 + }, + { + "epoch": 0.6244003630234669, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 35.40752217083433, + "learning_rate": 7.658592848904267e-07, + "logits/chosen": 0.2818371653556824, + "logits/rejected": 0.41613805294036865, + "logps/accuracies": 0.9375, + "logps/chosen": -254.01771545410156, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -242.12171936035156, + "logps/ref_rejected": -286.4274597167969, + "logps/rejected": -363.48065185546875, + "loss": 0.5697, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.189597725868225, + "rewards/grad_term": 0.010368636809289455, + "rewards/margins": 6.51572322845459, + "rewards/rejected": -7.705321311950684, + "step": 301 + }, + { + "epoch": 0.626474782834176, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 56.807968171838894, + "learning_rate": 7.647058823529411e-07, + "logits/chosen": 0.23013733327388763, + "logits/rejected": 0.2862010598182678, + "logps/accuracies": 0.9375, + "logps/chosen": -326.96173095703125, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -329.29400634765625, + "logps/ref_rejected": -335.8630676269531, + "logps/rejected": -414.28753662109375, + "loss": 0.4896, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23322616517543793, + "rewards/grad_term": 0.0038325442001223564, + "rewards/margins": 8.075675010681152, + "rewards/rejected": -7.842449188232422, + "step": 302 + }, + { + "epoch": 0.6285492026448852, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 30.936866332774525, + "learning_rate": 7.635524798154555e-07, + "logits/chosen": 0.38763344287872314, + "logits/rejected": 0.42555707693099976, + "logps/accuracies": 0.8125, + "logps/chosen": -309.457763671875, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -298.310791015625, + "logps/ref_rejected": -305.3328857421875, + "logps/rejected": -386.14520263671875, + "loss": 0.5817, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1147011518478394, + "rewards/grad_term": 0.008538071066141129, + "rewards/margins": 6.966533660888672, + "rewards/rejected": -8.0812349319458, + "step": 303 + }, + { + "epoch": 0.6306236224555944, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 39.0549516486327, + "learning_rate": 7.623990772779699e-07, + "logits/chosen": 0.3378972113132477, + "logits/rejected": 0.33350008726119995, + "logps/accuracies": 0.75, + "logps/chosen": -296.42120361328125, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -286.9034423828125, + "logps/ref_rejected": -254.8472137451172, + "logps/rejected": -342.2765808105469, + "loss": 0.5692, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9517745971679688, + "rewards/grad_term": 0.009382160380482674, + "rewards/margins": 7.791163921356201, + "rewards/rejected": -8.742938995361328, + "step": 304 + }, + { + "epoch": 0.6326980422663037, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 25.487656661381987, + "learning_rate": 7.612456747404843e-07, + "logits/chosen": -0.0026643723249435425, + "logits/rejected": 0.13260780274868011, + "logps/accuracies": 0.8125, + "logps/chosen": -337.88592529296875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -335.2144775390625, + "logps/ref_rejected": -367.57647705078125, + "logps/rejected": -445.0696716308594, + "loss": 0.5336, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2671446204185486, + "rewards/grad_term": 0.009766257368028164, + "rewards/margins": 7.482178211212158, + "rewards/rejected": -7.749322891235352, + "step": 305 + }, + { + "epoch": 0.6347724620770129, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 53.78629399737712, + "learning_rate": 7.600922722029988e-07, + "logits/chosen": 0.17397280037403107, + "logits/rejected": 0.13213837146759033, + "logps/accuracies": 0.875, + "logps/chosen": -296.3286437988281, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -288.9657897949219, + "logps/ref_rejected": -271.7525329589844, + "logps/rejected": -347.7431640625, + "loss": 0.6198, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7362870573997498, + "rewards/grad_term": 0.008981076069176197, + "rewards/margins": 6.862776756286621, + "rewards/rejected": -7.599064826965332, + "step": 306 + }, + { + "epoch": 0.636846881887722, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 38.57273088962716, + "learning_rate": 7.589388696655133e-07, + "logits/chosen": 0.16268330812454224, + "logits/rejected": 0.30625462532043457, + "logps/accuracies": 1.0, + "logps/chosen": -300.149169921875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -308.031982421875, + "logps/ref_rejected": -308.461181640625, + "logps/rejected": -392.94122314453125, + "loss": 0.52, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7882769703865051, + "rewards/grad_term": 0.0005324217490851879, + "rewards/margins": 9.236281394958496, + "rewards/rejected": -8.448005676269531, + "step": 307 + }, + { + "epoch": 0.6389213016984312, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 14.679022976693457, + "learning_rate": 7.577854671280276e-07, + "logits/chosen": 0.11276095360517502, + "logits/rejected": 0.15488047897815704, + "logps/accuracies": 0.75, + "logps/chosen": -320.9672546386719, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -317.51580810546875, + "logps/ref_rejected": -312.4486999511719, + "logps/rejected": -392.2602844238281, + "loss": 0.5461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3451465368270874, + "rewards/grad_term": 0.005722560919821262, + "rewards/margins": 7.636013507843018, + "rewards/rejected": -7.9811601638793945, + "step": 308 + }, + { + "epoch": 0.6409957215091404, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 44.76031279155951, + "learning_rate": 7.566320645905421e-07, + "logits/chosen": 0.15411929786205292, + "logits/rejected": 0.17396250367164612, + "logps/accuracies": 1.0, + "logps/chosen": -271.82232666015625, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -267.814697265625, + "logps/ref_rejected": -285.9453430175781, + "logps/rejected": -360.15850830078125, + "loss": 0.5586, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.40076228976249695, + "rewards/grad_term": 0.009672937914729118, + "rewards/margins": 7.020550727844238, + "rewards/rejected": -7.421313762664795, + "step": 309 + }, + { + "epoch": 0.6430701413198496, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 53.09216498794514, + "learning_rate": 7.554786620530565e-07, + "logits/chosen": 0.1644693911075592, + "logits/rejected": 0.23783330619335175, + "logps/accuracies": 0.9375, + "logps/chosen": -335.7210693359375, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -341.6274719238281, + "logps/ref_rejected": -344.434814453125, + "logps/rejected": -419.43768310546875, + "loss": 0.5501, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5906396508216858, + "rewards/grad_term": 0.006133922841399908, + "rewards/margins": 8.090925216674805, + "rewards/rejected": -7.500285625457764, + "step": 310 + }, + { + "epoch": 0.6451445611305588, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0625, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 41.39136172238844, + "learning_rate": 7.543252595155709e-07, + "logits/chosen": 0.06381943821907043, + "logits/rejected": 0.08010812848806381, + "logps/accuracies": 0.8125, + "logps/chosen": -213.8368682861328, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -210.74017333984375, + "logps/ref_rejected": -222.75961303710938, + "logps/rejected": -290.5465087890625, + "loss": 0.5381, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.30966925621032715, + "rewards/grad_term": 0.008784075267612934, + "rewards/margins": 6.469019889831543, + "rewards/rejected": -6.778688907623291, + "step": 311 + }, + { + "epoch": 0.647218980941268, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 65.96838712092502, + "learning_rate": 7.531718569780853e-07, + "logits/chosen": 0.14972330629825592, + "logits/rejected": 0.19796700775623322, + "logps/accuracies": 0.8125, + "logps/chosen": -271.4801940917969, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -271.1293029785156, + "logps/ref_rejected": -286.2263488769531, + "logps/rejected": -364.43609619140625, + "loss": 0.4888, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.035090282559394836, + "rewards/grad_term": 0.005572815891355276, + "rewards/margins": 7.785881042480469, + "rewards/rejected": -7.8209710121154785, + "step": 312 + }, + { + "epoch": 0.6492934007519772, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 65.98088553626904, + "learning_rate": 7.520184544405997e-07, + "logits/chosen": 0.14395220577716827, + "logits/rejected": 0.09385178238153458, + "logps/accuracies": 0.8125, + "logps/chosen": -357.7359924316406, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -338.0129089355469, + "logps/ref_rejected": -343.6756896972656, + "logps/rejected": -418.16510009765625, + "loss": 0.563, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.972308874130249, + "rewards/grad_term": 0.013005997985601425, + "rewards/margins": 5.476626873016357, + "rewards/rejected": -7.4489359855651855, + "step": 313 + }, + { + "epoch": 0.6513678205626864, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 56.93501034330218, + "learning_rate": 7.508650519031141e-07, + "logits/chosen": 0.13935233652591705, + "logits/rejected": 0.19025281071662903, + "logps/accuracies": 0.8125, + "logps/chosen": -225.3142547607422, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -217.11978149414062, + "logps/ref_rejected": -215.02268981933594, + "logps/rejected": -280.1806640625, + "loss": 0.5613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8194477558135986, + "rewards/grad_term": 0.01098605990409851, + "rewards/margins": 5.696350574493408, + "rewards/rejected": -6.5157976150512695, + "step": 314 + }, + { + "epoch": 0.6534422403733956, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 47.21672943404337, + "learning_rate": 7.497116493656286e-07, + "logits/chosen": 0.12312566488981247, + "logits/rejected": 0.11346716433763504, + "logps/accuracies": 0.9375, + "logps/chosen": -278.6199951171875, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -275.3497009277344, + "logps/ref_rejected": -268.13726806640625, + "logps/rejected": -360.0812072753906, + "loss": 0.627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32702964544296265, + "rewards/grad_term": 0.00423807417973876, + "rewards/margins": 8.867365837097168, + "rewards/rejected": -9.194396018981934, + "step": 315 + }, + { + "epoch": 0.6555166601841047, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 42.73491649478954, + "learning_rate": 7.485582468281429e-07, + "logits/chosen": 0.12578445672988892, + "logits/rejected": 0.11325564980506897, + "logps/accuracies": 0.9375, + "logps/chosen": -312.9777526855469, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -294.9630432128906, + "logps/ref_rejected": -309.66436767578125, + "logps/rejected": -403.28887939453125, + "loss": 0.5566, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8014687299728394, + "rewards/grad_term": 0.006151386070996523, + "rewards/margins": 7.560985088348389, + "rewards/rejected": -9.36245346069336, + "step": 316 + }, + { + "epoch": 0.6575910799948139, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 26.036325870194347, + "learning_rate": 7.474048442906574e-07, + "logits/chosen": 0.2790083587169647, + "logits/rejected": 0.30802425742149353, + "logps/accuracies": 0.875, + "logps/chosen": -284.39532470703125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -284.0159606933594, + "logps/ref_rejected": -297.21063232421875, + "logps/rejected": -367.1194763183594, + "loss": 0.5027, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03793831914663315, + "rewards/grad_term": 0.00753421988338232, + "rewards/margins": 6.952947616577148, + "rewards/rejected": -6.9908857345581055, + "step": 317 + }, + { + "epoch": 0.6596654998055231, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 48.848208501371566, + "learning_rate": 7.462514417531717e-07, + "logits/chosen": 0.23110151290893555, + "logits/rejected": 0.23169729113578796, + "logps/accuracies": 0.8125, + "logps/chosen": -345.29351806640625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -346.4335632324219, + "logps/ref_rejected": -320.21112060546875, + "logps/rejected": -389.7298889160156, + "loss": 0.5338, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11400166153907776, + "rewards/grad_term": 0.005073026288300753, + "rewards/margins": 7.065882205963135, + "rewards/rejected": -6.951880931854248, + "step": 318 + }, + { + "epoch": 0.6617399196162324, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 37.43721667408988, + "learning_rate": 7.450980392156863e-07, + "logits/chosen": -0.11392828822135925, + "logits/rejected": -0.1736583262681961, + "logps/accuracies": 0.5625, + "logps/chosen": -349.2891540527344, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -354.2510986328125, + "logps/ref_rejected": -316.1826477050781, + "logps/rejected": -374.9449768066406, + "loss": 0.5794, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4961950182914734, + "rewards/grad_term": 0.01609090529382229, + "rewards/margins": 6.372428894042969, + "rewards/rejected": -5.87623405456543, + "step": 319 + }, + { + "epoch": 0.6638143394269416, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 48.93249687471918, + "learning_rate": 7.439446366782007e-07, + "logits/chosen": 0.22672495245933533, + "logits/rejected": 0.23626157641410828, + "logps/accuracies": 0.875, + "logps/chosen": -300.6910095214844, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -306.4937744140625, + "logps/ref_rejected": -316.8069763183594, + "logps/rejected": -371.29150390625, + "loss": 0.579, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5802759528160095, + "rewards/grad_term": 0.011157616972923279, + "rewards/margins": 6.02873420715332, + "rewards/rejected": -5.448458194732666, + "step": 320 + }, + { + "epoch": 0.6638143394269416, + "eval_flips/correct->correct": 0.43842363357543945, + "eval_flips/correct->incorrect": 0.004926108289510012, + "eval_flips/incorrect->correct": 0.30049261450767517, + "eval_flips/incorrect->incorrect": 0.25615763664245605, + "eval_logits/chosen": 0.15680116415023804, + "eval_logits/rejected": 0.20004509389400482, + "eval_logps/accuracies": 0.738916277885437, + "eval_logps/chosen": -288.21343994140625, + "eval_logps/ref_accuracies": 0.4433497488498688, + "eval_logps/ref_chosen": -287.3511047363281, + "eval_logps/ref_rejected": -289.0460205078125, + "eval_logps/rejected": -336.36444091796875, + "eval_loss": 0.6191994547843933, + "eval_rewards/accuracies": 0.871921181678772, + "eval_rewards/chosen": -0.08623380959033966, + "eval_rewards/grad_term": 0.017411047592759132, + "eval_rewards/margins": 4.645606994628906, + "eval_rewards/rejected": -4.731841087341309, + "eval_runtime": 800.1629, + "eval_samples_per_second": 2.022, + "eval_steps_per_second": 0.254, + "step": 320 + }, + { + "epoch": 0.6658887592376507, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 89.14243856965115, + "learning_rate": 7.427912341407151e-07, + "logits/chosen": 0.25495031476020813, + "logits/rejected": 0.36095547676086426, + "logps/accuracies": 0.875, + "logps/chosen": -296.5644836425781, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -300.39422607421875, + "logps/ref_rejected": -350.05474853515625, + "logps/rejected": -397.4334716796875, + "loss": 0.6239, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.38297611474990845, + "rewards/grad_term": 0.014660445041954517, + "rewards/margins": 5.120844841003418, + "rewards/rejected": -4.7378692626953125, + "step": 321 + }, + { + "epoch": 0.6679631790483599, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 16.638863764418918, + "learning_rate": 7.416378316032295e-07, + "logits/chosen": -0.062116291373968124, + "logits/rejected": 0.08867709338665009, + "logps/accuracies": 0.625, + "logps/chosen": -348.11456298828125, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -352.7536315917969, + "logps/ref_rejected": -350.36370849609375, + "logps/rejected": -394.8106689453125, + "loss": 0.5791, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4639059007167816, + "rewards/grad_term": 0.016094159334897995, + "rewards/margins": 4.908601760864258, + "rewards/rejected": -4.444696426391602, + "step": 322 + }, + { + "epoch": 0.6700375988590691, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 69.32056292827173, + "learning_rate": 7.404844290657439e-07, + "logits/chosen": 0.2704838514328003, + "logits/rejected": 0.2706650495529175, + "logps/accuracies": 0.75, + "logps/chosen": -313.2464599609375, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -315.1943664550781, + "logps/ref_rejected": -296.2995300292969, + "logps/rejected": -356.75299072265625, + "loss": 0.5689, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.19478662312030792, + "rewards/grad_term": 0.016283176839351654, + "rewards/margins": 6.240136623382568, + "rewards/rejected": -6.045351028442383, + "step": 323 + }, + { + "epoch": 0.6721120186697783, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 53.57049601603343, + "learning_rate": 7.393310265282583e-07, + "logits/chosen": 0.24967102706432343, + "logits/rejected": 0.2552967667579651, + "logps/accuracies": 0.8125, + "logps/chosen": -270.054443359375, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -260.4380187988281, + "logps/ref_rejected": -254.31671142578125, + "logps/rejected": -320.3769836425781, + "loss": 0.5592, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9616467952728271, + "rewards/grad_term": 0.017603037878870964, + "rewards/margins": 5.644383430480957, + "rewards/rejected": -6.606029987335205, + "step": 324 + }, + { + "epoch": 0.6741864384804875, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 54.49892498689122, + "learning_rate": 7.381776239907728e-07, + "logits/chosen": 0.22812658548355103, + "logits/rejected": 0.2515120506286621, + "logps/accuracies": 0.9375, + "logps/chosen": -324.39599609375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -317.1215515136719, + "logps/ref_rejected": -327.36962890625, + "logps/rejected": -387.77032470703125, + "loss": 0.5858, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7274415493011475, + "rewards/grad_term": 0.015195751562714577, + "rewards/margins": 5.312624931335449, + "rewards/rejected": -6.040066242218018, + "step": 325 + }, + { + "epoch": 0.6762608582911966, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 48.53308556944907, + "learning_rate": 7.370242214532871e-07, + "logits/chosen": -0.008343299850821495, + "logits/rejected": -0.03129954636096954, + "logps/accuracies": 0.75, + "logps/chosen": -356.82958984375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -344.5155029296875, + "logps/ref_rejected": -331.8990783691406, + "logps/rejected": -418.02227783203125, + "loss": 0.5789, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2314121723175049, + "rewards/grad_term": 0.005676737520843744, + "rewards/margins": 7.380904197692871, + "rewards/rejected": -8.612316131591797, + "step": 326 + }, + { + "epoch": 0.6783352781019059, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 98.7994855905736, + "learning_rate": 7.358708189158016e-07, + "logits/chosen": 0.005753070116043091, + "logits/rejected": 0.01671770215034485, + "logps/accuracies": 0.625, + "logps/chosen": -313.5362243652344, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -289.57489013671875, + "logps/ref_rejected": -292.51513671875, + "logps/rejected": -378.51617431640625, + "loss": 0.595, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3961288928985596, + "rewards/grad_term": 0.017515743151307106, + "rewards/margins": 6.203976631164551, + "rewards/rejected": -8.600106239318848, + "step": 327 + }, + { + "epoch": 0.6804096979126151, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 46.182216579421194, + "learning_rate": 7.347174163783159e-07, + "logits/chosen": 0.44519540667533875, + "logits/rejected": 0.4516918659210205, + "logps/accuracies": 1.0, + "logps/chosen": -261.4315490722656, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -263.67852783203125, + "logps/ref_rejected": -263.0929260253906, + "logps/rejected": -354.0087890625, + "loss": 0.5502, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22469913959503174, + "rewards/grad_term": 0.00040459661977365613, + "rewards/margins": 9.316282272338867, + "rewards/rejected": -9.091583251953125, + "step": 328 + }, + { + "epoch": 0.6824841177233243, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 57.57752345437062, + "learning_rate": 7.335640138408304e-07, + "logits/chosen": 0.35632041096687317, + "logits/rejected": 0.3070759177207947, + "logps/accuracies": 0.8125, + "logps/chosen": -300.12353515625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -303.2857360839844, + "logps/ref_rejected": -288.00970458984375, + "logps/rejected": -365.837646484375, + "loss": 0.6026, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3162227272987366, + "rewards/grad_term": 0.007446423638612032, + "rewards/margins": 8.099015235900879, + "rewards/rejected": -7.782792568206787, + "step": 329 + }, + { + "epoch": 0.6845585375340334, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 53.06794587438548, + "learning_rate": 7.324106113033448e-07, + "logits/chosen": 0.08217829465866089, + "logits/rejected": 0.2244112640619278, + "logps/accuracies": 0.9375, + "logps/chosen": -315.64288330078125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -322.9136962890625, + "logps/ref_rejected": -385.41436767578125, + "logps/rejected": -465.26690673828125, + "loss": 0.5569, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7270775437355042, + "rewards/grad_term": 0.004581013694405556, + "rewards/margins": 8.71232795715332, + "rewards/rejected": -7.985250473022461, + "step": 330 + }, + { + "epoch": 0.6866329573447426, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 19.260377458337114, + "learning_rate": 7.312572087658593e-07, + "logits/chosen": 0.07186198234558105, + "logits/rejected": 0.11489441245794296, + "logps/accuracies": 0.8125, + "logps/chosen": -319.80755615234375, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -325.8714599609375, + "logps/ref_rejected": -329.986572265625, + "logps/rejected": -402.0074462890625, + "loss": 0.5508, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6063953042030334, + "rewards/grad_term": 0.006578431464731693, + "rewards/margins": 7.808480739593506, + "rewards/rejected": -7.202085971832275, + "step": 331 + }, + { + "epoch": 0.6887073771554518, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 17.756201694843515, + "learning_rate": 7.301038062283737e-07, + "logits/chosen": 0.20243048667907715, + "logits/rejected": 0.28428226709365845, + "logps/accuracies": 1.0, + "logps/chosen": -302.6923828125, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -292.7161865234375, + "logps/ref_rejected": -302.3647766113281, + "logps/rejected": -387.39251708984375, + "loss": 0.5204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9976207613945007, + "rewards/grad_term": 0.008672392927110195, + "rewards/margins": 7.505157470703125, + "rewards/rejected": -8.502777099609375, + "step": 332 + }, + { + "epoch": 0.690781796966161, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 83.21722843296402, + "learning_rate": 7.289504036908881e-07, + "logits/chosen": 0.18854957818984985, + "logits/rejected": 0.13426542282104492, + "logps/accuracies": 0.6875, + "logps/chosen": -330.0719909667969, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -328.91522216796875, + "logps/ref_rejected": -307.397705078125, + "logps/rejected": -382.39984130859375, + "loss": 0.549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11567914485931396, + "rewards/grad_term": 0.004070833325386047, + "rewards/margins": 7.384533882141113, + "rewards/rejected": -7.500212669372559, + "step": 333 + }, + { + "epoch": 0.6928562167768703, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 41.93819396079496, + "learning_rate": 7.277970011534025e-07, + "logits/chosen": 0.0038331379182636738, + "logits/rejected": 0.06832897663116455, + "logps/accuracies": 0.9375, + "logps/chosen": -271.9588317871094, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -270.6300964355469, + "logps/ref_rejected": -264.27239990234375, + "logps/rejected": -333.4945373535156, + "loss": 0.5229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13287392258644104, + "rewards/grad_term": 0.004513449501246214, + "rewards/margins": 6.789344787597656, + "rewards/rejected": -6.922219276428223, + "step": 334 + }, + { + "epoch": 0.6949306365875794, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 36.44772897953967, + "learning_rate": 7.26643598615917e-07, + "logits/chosen": 0.08206385374069214, + "logits/rejected": 0.15132063627243042, + "logps/accuracies": 0.8125, + "logps/chosen": -281.35186767578125, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -291.0823669433594, + "logps/ref_rejected": -298.27886962890625, + "logps/rejected": -367.3419494628906, + "loss": 0.5405, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9730521440505981, + "rewards/grad_term": 0.002131945453584194, + "rewards/margins": 7.879360198974609, + "rewards/rejected": -6.906307697296143, + "step": 335 + }, + { + "epoch": 0.6970050563982886, + "flips/correct->correct": 0.8125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 28.51646796947631, + "learning_rate": 7.254901960784313e-07, + "logits/chosen": 0.020516425371170044, + "logits/rejected": 0.06293690204620361, + "logps/accuracies": 0.9375, + "logps/chosen": -308.943359375, + "logps/ref_accuracies": 0.8125, + "logps/ref_chosen": -308.57928466796875, + "logps/ref_rejected": -345.8021545410156, + "logps/rejected": -399.0782470703125, + "loss": 0.5161, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03641030192375183, + "rewards/grad_term": 0.012177910655736923, + "rewards/margins": 5.291202068328857, + "rewards/rejected": -5.327611923217773, + "step": 336 + }, + { + "epoch": 0.6990794762089978, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 34.67482984022363, + "learning_rate": 7.243367935409458e-07, + "logits/chosen": 0.09479643404483795, + "logits/rejected": 0.13485944271087646, + "logps/accuracies": 0.8125, + "logps/chosen": -350.71783447265625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -344.1441955566406, + "logps/ref_rejected": -334.01727294921875, + "logps/rejected": -425.11920166015625, + "loss": 0.5386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6573646068572998, + "rewards/grad_term": 0.005361511372029781, + "rewards/margins": 8.452826499938965, + "rewards/rejected": -9.110189437866211, + "step": 337 + }, + { + "epoch": 0.701153896019707, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 92.66931260479595, + "learning_rate": 7.231833910034601e-07, + "logits/chosen": 0.16966593265533447, + "logits/rejected": 0.13631996512413025, + "logps/accuracies": 0.875, + "logps/chosen": -345.20770263671875, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -337.16741943359375, + "logps/ref_rejected": -317.2330017089844, + "logps/rejected": -405.1679992675781, + "loss": 0.5231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8040257096290588, + "rewards/grad_term": 0.003560500219464302, + "rewards/margins": 7.989476203918457, + "rewards/rejected": -8.793501853942871, + "step": 338 + }, + { + "epoch": 0.7032283158304162, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 28.389402918089036, + "learning_rate": 7.220299884659746e-07, + "logits/chosen": 0.17944695055484772, + "logits/rejected": 0.2976837456226349, + "logps/accuracies": 0.875, + "logps/chosen": -262.23980712890625, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -265.7380676269531, + "logps/ref_rejected": -294.07598876953125, + "logps/rejected": -336.75543212890625, + "loss": 0.5853, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.34982502460479736, + "rewards/grad_term": 0.018309494480490685, + "rewards/margins": 4.617773532867432, + "rewards/rejected": -4.267948150634766, + "step": 339 + }, + { + "epoch": 0.7053027356411253, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 33.29343285253659, + "learning_rate": 7.20876585928489e-07, + "logits/chosen": 0.042180366814136505, + "logits/rejected": 0.008254090324044228, + "logps/accuracies": 0.75, + "logps/chosen": -366.08233642578125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -362.78900146484375, + "logps/ref_rejected": -327.56585693359375, + "logps/rejected": -406.1036071777344, + "loss": 0.5219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3293338716030121, + "rewards/grad_term": 0.006429283879697323, + "rewards/margins": 7.524442672729492, + "rewards/rejected": -7.8537774085998535, + "step": 340 + }, + { + "epoch": 0.7073771554518345, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 38.22067938349734, + "learning_rate": 7.197231833910034e-07, + "logits/chosen": 0.1745857149362564, + "logits/rejected": 0.22055479884147644, + "logps/accuracies": 0.75, + "logps/chosen": -223.20620727539062, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -214.50759887695312, + "logps/ref_rejected": -225.8865509033203, + "logps/rejected": -280.5146179199219, + "loss": 0.5904, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8698611259460449, + "rewards/grad_term": 0.020344514399766922, + "rewards/margins": 4.592945098876953, + "rewards/rejected": -5.46280574798584, + "step": 341 + }, + { + "epoch": 0.7094515752625438, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 29.038636096202257, + "learning_rate": 7.185697808535178e-07, + "logits/chosen": 0.14953972399234772, + "logits/rejected": 0.15320980548858643, + "logps/accuracies": 0.9375, + "logps/chosen": -289.46533203125, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -288.3252868652344, + "logps/ref_rejected": -298.48101806640625, + "logps/rejected": -359.2103576660156, + "loss": 0.6004, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11400707066059113, + "rewards/grad_term": 0.022022824734449387, + "rewards/margins": 5.958928108215332, + "rewards/rejected": -6.072935104370117, + "step": 342 + }, + { + "epoch": 0.711525995073253, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 33.00355372540172, + "learning_rate": 7.174163783160324e-07, + "logits/chosen": -0.06829185783863068, + "logits/rejected": -0.009546427056193352, + "logps/accuracies": 0.8125, + "logps/chosen": -352.6844177246094, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -354.55364990234375, + "logps/ref_rejected": -351.40203857421875, + "logps/rejected": -413.4746398925781, + "loss": 0.5504, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18692292273044586, + "rewards/grad_term": 0.01500310655683279, + "rewards/margins": 6.394184112548828, + "rewards/rejected": -6.207261085510254, + "step": 343 + }, + { + "epoch": 0.7136004148839621, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 29.545604358917398, + "learning_rate": 7.162629757785467e-07, + "logits/chosen": 0.21840424835681915, + "logits/rejected": 0.3245609402656555, + "logps/accuracies": 0.9375, + "logps/chosen": -289.7184143066406, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -281.7314758300781, + "logps/ref_rejected": -305.2322082519531, + "logps/rejected": -371.3037414550781, + "loss": 0.5546, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7986934185028076, + "rewards/grad_term": 0.02135634422302246, + "rewards/margins": 5.8084611892700195, + "rewards/rejected": -6.607154369354248, + "step": 344 + }, + { + "epoch": 0.7156748346946713, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 18.685102213495963, + "learning_rate": 7.151095732410612e-07, + "logits/chosen": 0.33599621057510376, + "logits/rejected": 0.259542852640152, + "logps/accuracies": 0.8125, + "logps/chosen": -297.86285400390625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -309.85284423828125, + "logps/ref_rejected": -319.9841613769531, + "logps/rejected": -364.2496643066406, + "loss": 0.5428, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1990000009536743, + "rewards/grad_term": 0.012635907158255577, + "rewards/margins": 5.625548839569092, + "rewards/rejected": -4.426548957824707, + "step": 345 + }, + { + "epoch": 0.7177492545053805, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 51.73399077401343, + "learning_rate": 7.139561707035755e-07, + "logits/chosen": 0.19686900079250336, + "logits/rejected": 0.2250552475452423, + "logps/accuracies": 0.875, + "logps/chosen": -334.487548828125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -344.34234619140625, + "logps/ref_rejected": -351.153564453125, + "logps/rejected": -423.1881103515625, + "loss": 0.5425, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9854794144630432, + "rewards/grad_term": 0.0029634374659508467, + "rewards/margins": 8.188934326171875, + "rewards/rejected": -7.203455924987793, + "step": 346 + }, + { + "epoch": 0.7198236743160897, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 36.992429397717665, + "learning_rate": 7.1280276816609e-07, + "logits/chosen": 0.3227195143699646, + "logits/rejected": 0.3476618230342865, + "logps/accuracies": 0.8125, + "logps/chosen": -283.4656066894531, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -281.3516540527344, + "logps/ref_rejected": -273.4134826660156, + "logps/rejected": -333.3074645996094, + "loss": 0.5596, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.21139609813690186, + "rewards/grad_term": 0.013419999741017818, + "rewards/margins": 5.778001308441162, + "rewards/rejected": -5.9893975257873535, + "step": 347 + }, + { + "epoch": 0.721898094126799, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 41.16834515003933, + "learning_rate": 7.116493656286043e-07, + "logits/chosen": 0.125450000166893, + "logits/rejected": 0.17324930429458618, + "logps/accuracies": 0.8125, + "logps/chosen": -296.0699157714844, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -281.47979736328125, + "logps/ref_rejected": -311.7028503417969, + "logps/rejected": -370.5085144042969, + "loss": 0.5877, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4590116739273071, + "rewards/grad_term": 0.020582564175128937, + "rewards/margins": 4.421552658081055, + "rewards/rejected": -5.8805646896362305, + "step": 348 + }, + { + "epoch": 0.723972513937508, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 28.04679377044801, + "learning_rate": 7.104959630911188e-07, + "logits/chosen": 0.02426442876458168, + "logits/rejected": 0.030082188546657562, + "logps/accuracies": 0.875, + "logps/chosen": -329.852294921875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -335.4429016113281, + "logps/ref_rejected": -336.506103515625, + "logps/rejected": -406.5826416015625, + "loss": 0.5545, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5590592622756958, + "rewards/grad_term": 0.007747030816972256, + "rewards/margins": 7.566709995269775, + "rewards/rejected": -7.007650852203369, + "step": 349 + }, + { + "epoch": 0.7260469337482173, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 33.4741646395165, + "learning_rate": 7.093425605536332e-07, + "logits/chosen": 0.014736661687493324, + "logits/rejected": 0.02637672983109951, + "logps/accuracies": 1.0, + "logps/chosen": -317.66864013671875, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -317.7265930175781, + "logps/ref_rejected": -342.7004699707031, + "logps/rejected": -423.1243896484375, + "loss": 0.5797, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005797684192657471, + "rewards/grad_term": 0.006446592975407839, + "rewards/margins": 8.048192977905273, + "rewards/rejected": -8.04239559173584, + "step": 350 + }, + { + "epoch": 0.7281213535589265, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 26.26791550365828, + "learning_rate": 7.081891580161476e-07, + "logits/chosen": -0.018278811126947403, + "logits/rejected": -0.055383071303367615, + "logps/accuracies": 0.875, + "logps/chosen": -334.0198974609375, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -332.0812683105469, + "logps/ref_rejected": -322.0040588378906, + "logps/rejected": -413.3955078125, + "loss": 0.5075, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19386102259159088, + "rewards/grad_term": 0.009265870787203312, + "rewards/margins": 8.945282936096191, + "rewards/rejected": -9.139144897460938, + "step": 351 + }, + { + "epoch": 0.7301957733696357, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 31.547111088022394, + "learning_rate": 7.07035755478662e-07, + "logits/chosen": 0.07085268199443817, + "logits/rejected": 0.11351241916418076, + "logps/accuracies": 0.875, + "logps/chosen": -323.64910888671875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -324.082275390625, + "logps/ref_rejected": -336.0316467285156, + "logps/rejected": -416.46917724609375, + "loss": 0.5288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04331676661968231, + "rewards/grad_term": 0.003617448266595602, + "rewards/margins": 8.087069511413574, + "rewards/rejected": -8.043752670288086, + "step": 352 + }, + { + "epoch": 0.7322701931803449, + "flips/correct->correct": 0.1875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 44.5849999823685, + "learning_rate": 7.058823529411765e-07, + "logits/chosen": 0.3111583888530731, + "logits/rejected": 0.2975752055644989, + "logps/accuracies": 0.75, + "logps/chosen": -315.43798828125, + "logps/ref_accuracies": 0.1875, + "logps/ref_chosen": -307.661865234375, + "logps/ref_rejected": -282.1680908203125, + "logps/rejected": -370.3470153808594, + "loss": 0.5453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.777613639831543, + "rewards/grad_term": 0.010531319305300713, + "rewards/margins": 8.040277481079102, + "rewards/rejected": -8.817892074584961, + "step": 353 + }, + { + "epoch": 0.734344612991054, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 42.86231892921175, + "learning_rate": 7.047289504036908e-07, + "logits/chosen": 0.2585771083831787, + "logits/rejected": 0.3336886465549469, + "logps/accuracies": 0.9375, + "logps/chosen": -327.9022216796875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -332.4975891113281, + "logps/ref_rejected": -343.6222839355469, + "logps/rejected": -417.87591552734375, + "loss": 0.5632, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.459539532661438, + "rewards/grad_term": 0.006328054238110781, + "rewards/margins": 7.8848958015441895, + "rewards/rejected": -7.425356864929199, + "step": 354 + }, + { + "epoch": 0.7364190328017632, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 38.37684555724554, + "learning_rate": 7.035755478662053e-07, + "logits/chosen": 0.07023249566555023, + "logits/rejected": 0.09015891700983047, + "logps/accuracies": 0.8125, + "logps/chosen": -336.48968505859375, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -324.7945861816406, + "logps/ref_rejected": -318.7522888183594, + "logps/rejected": -398.0233154296875, + "loss": 0.6105, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1695095300674438, + "rewards/grad_term": 0.01756344363093376, + "rewards/margins": 6.757594585418701, + "rewards/rejected": -7.927104473114014, + "step": 355 + }, + { + "epoch": 0.7384934526124725, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 35.08769051586877, + "learning_rate": 7.024221453287197e-07, + "logits/chosen": 0.07610762119293213, + "logits/rejected": 0.18170149624347687, + "logps/accuracies": 0.875, + "logps/chosen": -259.91943359375, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -262.623779296875, + "logps/ref_rejected": -292.6954040527344, + "logps/rejected": -350.150146484375, + "loss": 0.5485, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27043378353118896, + "rewards/grad_term": 0.011945006437599659, + "rewards/margins": 6.015911102294922, + "rewards/rejected": -5.745476722717285, + "step": 356 + }, + { + "epoch": 0.7405678724231817, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 26.590182989230307, + "learning_rate": 7.012687427912342e-07, + "logits/chosen": -0.002248242497444153, + "logits/rejected": 0.0742294117808342, + "logps/accuracies": 0.8125, + "logps/chosen": -248.39425659179688, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -245.6069793701172, + "logps/ref_rejected": -279.5992736816406, + "logps/rejected": -343.59112548828125, + "loss": 0.522, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2787279188632965, + "rewards/grad_term": 0.017660701647400856, + "rewards/margins": 6.120457172393799, + "rewards/rejected": -6.399184703826904, + "step": 357 + }, + { + "epoch": 0.7426422922338909, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 29.97128814142722, + "learning_rate": 7.001153402537486e-07, + "logits/chosen": 0.05706937611103058, + "logits/rejected": 0.20207172632217407, + "logps/accuracies": 0.75, + "logps/chosen": -336.1678771972656, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -342.0132751464844, + "logps/ref_rejected": -353.97802734375, + "logps/rejected": -400.2590637207031, + "loss": 0.4873, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5845356583595276, + "rewards/grad_term": 0.012262849137187004, + "rewards/margins": 5.212644577026367, + "rewards/rejected": -4.628108978271484, + "step": 358 + }, + { + "epoch": 0.7447167120446, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 63.41932463356418, + "learning_rate": 6.98961937716263e-07, + "logits/chosen": 0.21743306517601013, + "logits/rejected": 0.2811052203178406, + "logps/accuracies": 0.75, + "logps/chosen": -293.9018249511719, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -303.6950988769531, + "logps/ref_rejected": -292.83917236328125, + "logps/rejected": -354.6083068847656, + "loss": 0.5279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9793245792388916, + "rewards/grad_term": 0.009044105187058449, + "rewards/margins": 7.156236171722412, + "rewards/rejected": -6.176911354064941, + "step": 359 + }, + { + "epoch": 0.7467911318553092, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 89.9026831204858, + "learning_rate": 6.978085351787774e-07, + "logits/chosen": 0.40088769793510437, + "logits/rejected": 0.40543943643569946, + "logps/accuracies": 0.8125, + "logps/chosen": -255.18496704101562, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -259.64080810546875, + "logps/ref_rejected": -265.6492919921875, + "logps/rejected": -336.0693359375, + "loss": 0.5575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.44558367133140564, + "rewards/grad_term": 0.01041356474161148, + "rewards/margins": 7.487587928771973, + "rewards/rejected": -7.042004108428955, + "step": 360 + }, + { + "epoch": 0.7488655516660184, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 22.748796947859415, + "learning_rate": 6.966551326412918e-07, + "logits/chosen": 0.31769663095474243, + "logits/rejected": 0.3735862970352173, + "logps/accuracies": 0.8125, + "logps/chosen": -305.60565185546875, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -306.97216796875, + "logps/ref_rejected": -295.53271484375, + "logps/rejected": -375.6974182128906, + "loss": 0.5312, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1366504728794098, + "rewards/grad_term": 0.003756206249818206, + "rewards/margins": 8.153119087219238, + "rewards/rejected": -8.016468048095703, + "step": 361 + }, + { + "epoch": 0.7509399714767276, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 48.649828270109175, + "learning_rate": 6.955017301038062e-07, + "logits/chosen": -0.11286991089582443, + "logits/rejected": -0.08522382378578186, + "logps/accuracies": 0.6875, + "logps/chosen": -306.65692138671875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -299.77130126953125, + "logps/ref_rejected": -305.17205810546875, + "logps/rejected": -366.79888916015625, + "loss": 0.563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.688561201095581, + "rewards/grad_term": 0.015329258516430855, + "rewards/margins": 5.474117279052734, + "rewards/rejected": -6.162679195404053, + "step": 362 + }, + { + "epoch": 0.7530143912874367, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 44.582697450921174, + "learning_rate": 6.943483275663207e-07, + "logits/chosen": 0.16607432067394257, + "logits/rejected": 0.19322986900806427, + "logps/accuracies": 0.9375, + "logps/chosen": -250.20355224609375, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -252.611572265625, + "logps/ref_rejected": -272.4234924316406, + "logps/rejected": -353.1186218261719, + "loss": 0.5006, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24080336093902588, + "rewards/grad_term": 0.006381358951330185, + "rewards/margins": 8.310314178466797, + "rewards/rejected": -8.069511413574219, + "step": 363 + }, + { + "epoch": 0.755088811098146, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 34.80591547675486, + "learning_rate": 6.93194925028835e-07, + "logits/chosen": 0.19473493099212646, + "logits/rejected": 0.18632598221302032, + "logps/accuracies": 0.8125, + "logps/chosen": -256.6549987792969, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -255.31813049316406, + "logps/ref_rejected": -249.58592224121094, + "logps/rejected": -313.6597900390625, + "loss": 0.5498, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13368618488311768, + "rewards/grad_term": 0.011986999772489071, + "rewards/margins": 6.273699760437012, + "rewards/rejected": -6.407385349273682, + "step": 364 + }, + { + "epoch": 0.7571632309088552, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 41.34316036796179, + "learning_rate": 6.920415224913494e-07, + "logits/chosen": 0.1397414356470108, + "logits/rejected": 0.23811021447181702, + "logps/accuracies": 0.9375, + "logps/chosen": -315.8439636230469, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -314.4014892578125, + "logps/ref_rejected": -363.3650817871094, + "logps/rejected": -422.4331359863281, + "loss": 0.5638, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.14424461126327515, + "rewards/grad_term": 0.012570216320455074, + "rewards/margins": 5.762563228607178, + "rewards/rejected": -5.906806945800781, + "step": 365 + }, + { + "epoch": 0.7592376507195644, + "flips/correct->correct": 0.1875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 23.772256648413755, + "learning_rate": 6.908881199538638e-07, + "logits/chosen": 0.05051745846867561, + "logits/rejected": -0.02119167149066925, + "logps/accuracies": 0.9375, + "logps/chosen": -253.32345581054688, + "logps/ref_accuracies": 0.1875, + "logps/ref_chosen": -254.56048583984375, + "logps/ref_rejected": -240.97332763671875, + "logps/rejected": -324.1744079589844, + "loss": 0.5553, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12370094656944275, + "rewards/grad_term": 0.0012932950630784035, + "rewards/margins": 8.443807601928711, + "rewards/rejected": -8.320106506347656, + "step": 366 + }, + { + "epoch": 0.7613120705302736, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 38.69831479632815, + "learning_rate": 6.897347174163782e-07, + "logits/chosen": 0.12788856029510498, + "logits/rejected": 0.16543559730052948, + "logps/accuracies": 0.9375, + "logps/chosen": -277.533447265625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -259.1481628417969, + "logps/ref_rejected": -260.858154296875, + "logps/rejected": -343.05718994140625, + "loss": 0.5869, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.838526725769043, + "rewards/grad_term": 0.012527183629572392, + "rewards/margins": 6.381375789642334, + "rewards/rejected": -8.219902992248535, + "step": 367 + }, + { + "epoch": 0.7633864903409827, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 35.55521896514245, + "learning_rate": 6.885813148788927e-07, + "logits/chosen": 0.15278108417987823, + "logits/rejected": 0.14702126383781433, + "logps/accuracies": 1.0, + "logps/chosen": -257.01214599609375, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -247.90902709960938, + "logps/ref_rejected": -265.93560791015625, + "logps/rejected": -346.7325134277344, + "loss": 0.5451, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9103094339370728, + "rewards/grad_term": 0.011162678711116314, + "rewards/margins": 7.169381618499756, + "rewards/rejected": -8.079690933227539, + "step": 368 + }, + { + "epoch": 0.7654609101516919, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 30.230401166275122, + "learning_rate": 6.874279123414071e-07, + "logits/chosen": 0.086149662733078, + "logits/rejected": 0.22511181235313416, + "logps/accuracies": 0.875, + "logps/chosen": -233.25259399414062, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -216.24533081054688, + "logps/ref_rejected": -227.90420532226562, + "logps/rejected": -305.6214599609375, + "loss": 0.569, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7007248401641846, + "rewards/grad_term": 0.013076627627015114, + "rewards/margins": 6.071000099182129, + "rewards/rejected": -7.771725177764893, + "step": 369 + }, + { + "epoch": 0.7675353299624011, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 52.22387916464415, + "learning_rate": 6.862745098039216e-07, + "logits/chosen": 0.2996848225593567, + "logits/rejected": 0.31599316000938416, + "logps/accuracies": 0.875, + "logps/chosen": -269.8756408691406, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -261.0709533691406, + "logps/ref_rejected": -246.64968872070312, + "logps/rejected": -322.8077392578125, + "loss": 0.5479, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8804708123207092, + "rewards/grad_term": 0.01271949615329504, + "rewards/margins": 6.735333442687988, + "rewards/rejected": -7.615804195404053, + "step": 370 + }, + { + "epoch": 0.7696097497731104, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 63.7741156365506, + "learning_rate": 6.851211072664359e-07, + "logits/chosen": -0.01846727915108204, + "logits/rejected": -0.008946547284722328, + "logps/accuracies": 0.875, + "logps/chosen": -305.61639404296875, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -300.2716064453125, + "logps/ref_rejected": -331.55859375, + "logps/rejected": -386.4937438964844, + "loss": 0.5752, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5344789624214172, + "rewards/grad_term": 0.02246464043855667, + "rewards/margins": 4.959036350250244, + "rewards/rejected": -5.4935150146484375, + "step": 371 + }, + { + "epoch": 0.7716841695838196, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 64.45685095510555, + "learning_rate": 6.839677047289504e-07, + "logits/chosen": 0.08011619746685028, + "logits/rejected": 0.09146730601787567, + "logps/accuracies": 0.8125, + "logps/chosen": -345.562255859375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -346.65069580078125, + "logps/ref_rejected": -352.12396240234375, + "logps/rejected": -403.227294921875, + "loss": 0.572, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10884669423103333, + "rewards/grad_term": 0.013191865757107735, + "rewards/margins": 5.219181537628174, + "rewards/rejected": -5.110335350036621, + "step": 372 + }, + { + "epoch": 0.7737585893945287, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 39.59306558130555, + "learning_rate": 6.828143021914648e-07, + "logits/chosen": -0.12652695178985596, + "logits/rejected": -0.07933872938156128, + "logps/accuracies": 0.8125, + "logps/chosen": -299.42108154296875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -306.9295654296875, + "logps/ref_rejected": -296.8434143066406, + "logps/rejected": -359.88623046875, + "loss": 0.5548, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7508491277694702, + "rewards/grad_term": 0.005514204967767, + "rewards/margins": 7.0551300048828125, + "rewards/rejected": -6.304280757904053, + "step": 373 + }, + { + "epoch": 0.7758330092052379, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 76.23798525574978, + "learning_rate": 6.816608996539792e-07, + "logits/chosen": 0.12230158597230911, + "logits/rejected": 0.12023597955703735, + "logps/accuracies": 0.75, + "logps/chosen": -281.0087890625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -282.221923828125, + "logps/ref_rejected": -290.686767578125, + "logps/rejected": -355.41357421875, + "loss": 0.603, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1213146299123764, + "rewards/grad_term": 0.011165942065417767, + "rewards/margins": 6.593995571136475, + "rewards/rejected": -6.472680568695068, + "step": 374 + }, + { + "epoch": 0.7779074290159471, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 34.99314645928879, + "learning_rate": 6.805074971164936e-07, + "logits/chosen": 0.15410278737545013, + "logits/rejected": 0.2643253803253174, + "logps/accuracies": 0.875, + "logps/chosen": -305.5322265625, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -313.14080810546875, + "logps/ref_rejected": -341.4920654296875, + "logps/rejected": -399.58892822265625, + "loss": 0.6245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7608582377433777, + "rewards/grad_term": 0.008731910958886147, + "rewards/margins": 6.570548057556152, + "rewards/rejected": -5.809689998626709, + "step": 375 + }, + { + "epoch": 0.7799818488266563, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 54.1532317962664, + "learning_rate": 6.79354094579008e-07, + "logits/chosen": 0.3877769708633423, + "logits/rejected": 0.35827726125717163, + "logps/accuracies": 0.6875, + "logps/chosen": -262.630126953125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -267.2193298339844, + "logps/ref_rejected": -227.17079162597656, + "logps/rejected": -268.9840087890625, + "loss": 0.6305, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45892155170440674, + "rewards/grad_term": 0.014158925041556358, + "rewards/margins": 4.640246391296387, + "rewards/rejected": -4.1813249588012695, + "step": 376 + }, + { + "epoch": 0.7820562686373654, + "flips/correct->correct": 0.1875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 29.1706749528385, + "learning_rate": 6.782006920415224e-07, + "logits/chosen": 0.33914873003959656, + "logits/rejected": 0.2858618199825287, + "logps/accuracies": 0.6875, + "logps/chosen": -333.1984558105469, + "logps/ref_accuracies": 0.1875, + "logps/ref_chosen": -334.16839599609375, + "logps/ref_rejected": -299.3583679199219, + "logps/rejected": -365.6182556152344, + "loss": 0.6119, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0969928503036499, + "rewards/grad_term": 0.011358851566910744, + "rewards/margins": 6.722982883453369, + "rewards/rejected": -6.62598991394043, + "step": 377 + }, + { + "epoch": 0.7841306884480747, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 84.54685393998759, + "learning_rate": 6.770472895040369e-07, + "logits/chosen": 0.24667781591415405, + "logits/rejected": 0.2804810106754303, + "logps/accuracies": 0.6875, + "logps/chosen": -298.77734375, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -290.1251220703125, + "logps/ref_rejected": -294.4027099609375, + "logps/rejected": -353.4877014160156, + "loss": 0.5833, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8652223348617554, + "rewards/grad_term": 0.013081303797662258, + "rewards/margins": 5.043279647827148, + "rewards/rejected": -5.908501625061035, + "step": 378 + }, + { + "epoch": 0.7862051082587839, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 70.28746618838973, + "learning_rate": 6.758938869665512e-07, + "logits/chosen": 0.13181552290916443, + "logits/rejected": 0.20094197988510132, + "logps/accuracies": 0.875, + "logps/chosen": -299.0951843261719, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -298.9969177246094, + "logps/ref_rejected": -297.56201171875, + "logps/rejected": -363.7420349121094, + "loss": 0.5688, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.009829364717006683, + "rewards/grad_term": 0.010300719179213047, + "rewards/margins": 6.60817289352417, + "rewards/rejected": -6.618002414703369, + "step": 379 + }, + { + "epoch": 0.7882795280694931, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 36.661344739750994, + "learning_rate": 6.747404844290657e-07, + "logits/chosen": 0.11381202936172485, + "logits/rejected": 0.27916306257247925, + "logps/accuracies": 0.75, + "logps/chosen": -311.8406982421875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -300.5757141113281, + "logps/ref_rejected": -303.3941345214844, + "logps/rejected": -374.7430419921875, + "loss": 0.5351, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1265006065368652, + "rewards/grad_term": 0.013736705295741558, + "rewards/margins": 6.008389949798584, + "rewards/rejected": -7.134890556335449, + "step": 380 + }, + { + "epoch": 0.7903539478802023, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 52.40416785212294, + "learning_rate": 6.735870818915801e-07, + "logits/chosen": 0.28354066610336304, + "logits/rejected": 0.3793669044971466, + "logps/accuracies": 0.9375, + "logps/chosen": -251.4017333984375, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -241.32391357421875, + "logps/ref_rejected": -268.19512939453125, + "logps/rejected": -346.08892822265625, + "loss": 0.5578, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0077815055847168, + "rewards/grad_term": 0.0052658445201814175, + "rewards/margins": 6.781601905822754, + "rewards/rejected": -7.789383411407471, + "step": 381 + }, + { + "epoch": 0.7924283676909114, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 30.49757789797264, + "learning_rate": 6.724336793540946e-07, + "logits/chosen": 0.4262790381908417, + "logits/rejected": 0.44936031103134155, + "logps/accuracies": 0.875, + "logps/chosen": -276.0723571777344, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -259.67919921875, + "logps/ref_rejected": -269.11407470703125, + "logps/rejected": -344.3211364746094, + "loss": 0.5833, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.639316201210022, + "rewards/grad_term": 0.017718428745865822, + "rewards/margins": 5.881390571594238, + "rewards/rejected": -7.520707130432129, + "step": 382 + }, + { + "epoch": 0.7945027875016206, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 32.850204291988575, + "learning_rate": 6.71280276816609e-07, + "logits/chosen": 0.41091158986091614, + "logits/rejected": 0.46820542216300964, + "logps/accuracies": 0.75, + "logps/chosen": -324.7238464355469, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -312.0603942871094, + "logps/ref_rejected": -325.2768859863281, + "logps/rejected": -401.4980773925781, + "loss": 0.6727, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2663447856903076, + "rewards/grad_term": 0.01295191328972578, + "rewards/margins": 6.35577392578125, + "rewards/rejected": -7.622118949890137, + "step": 383 + }, + { + "epoch": 0.7965772073123298, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 40.68382714512231, + "learning_rate": 6.701268742791234e-07, + "logits/chosen": -0.04504679515957832, + "logits/rejected": -0.05939174070954323, + "logps/accuracies": 0.5, + "logps/chosen": -364.9970703125, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -330.1788635253906, + "logps/ref_rejected": -310.0872497558594, + "logps/rejected": -393.5850830078125, + "loss": 0.6752, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.481823444366455, + "rewards/grad_term": 0.014739114791154861, + "rewards/margins": 4.8679633140563965, + "rewards/rejected": -8.349786758422852, + "step": 384 + }, + { + "epoch": 0.7965772073123298, + "eval_flips/correct->correct": 0.4433497488498688, + "eval_flips/correct->incorrect": 0.0, + "eval_flips/incorrect->correct": 0.3497537076473236, + "eval_flips/incorrect->incorrect": 0.2068965584039688, + "eval_logits/chosen": 0.1350509524345398, + "eval_logits/rejected": 0.17706024646759033, + "eval_logps/accuracies": 0.7931034564971924, + "eval_logps/chosen": -310.3870544433594, + "eval_logps/ref_accuracies": 0.4433497488498688, + "eval_logps/ref_chosen": -287.3511047363281, + "eval_logps/ref_rejected": -289.0460205078125, + "eval_logps/rejected": -369.9229431152344, + "eval_loss": 0.6723487973213196, + "eval_rewards/accuracies": 0.9261083602905273, + "eval_rewards/chosen": -2.3035953044891357, + "eval_rewards/grad_term": 0.011555198580026627, + "eval_rewards/margins": 5.784095287322998, + "eval_rewards/rejected": -8.087691307067871, + "eval_runtime": 804.6111, + "eval_samples_per_second": 2.011, + "eval_steps_per_second": 0.252, + "step": 384 + }, + { + "epoch": 0.798651627123039, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 50.762610162394786, + "learning_rate": 6.689734717416378e-07, + "logits/chosen": -0.012154202908277512, + "logits/rejected": 0.0032455138862133026, + "logps/accuracies": 0.8125, + "logps/chosen": -352.6640930175781, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -328.68597412109375, + "logps/ref_rejected": -300.6054992675781, + "logps/rejected": -397.1305847167969, + "loss": 0.6399, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3978145122528076, + "rewards/grad_term": 0.0043524750508368015, + "rewards/margins": 7.254694938659668, + "rewards/rejected": -9.652509689331055, + "step": 385 + }, + { + "epoch": 0.8007260469337483, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 95.05673688341686, + "learning_rate": 6.678200692041522e-07, + "logits/chosen": 0.21177135407924652, + "logits/rejected": 0.23154297471046448, + "logps/accuracies": 0.875, + "logps/chosen": -336.6234130859375, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -302.63336181640625, + "logps/ref_rejected": -325.64202880859375, + "logps/rejected": -405.45782470703125, + "loss": 0.6303, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.399005174636841, + "rewards/grad_term": 0.015561016276478767, + "rewards/margins": 4.582573413848877, + "rewards/rejected": -7.981578826904297, + "step": 386 + }, + { + "epoch": 0.8028004667444574, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.4375, + "grad_norm": 98.7595748405997, + "learning_rate": 6.666666666666666e-07, + "logits/chosen": -0.1825534999370575, + "logits/rejected": -0.13728323578834534, + "logps/accuracies": 0.5625, + "logps/chosen": -279.4841613769531, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -266.027099609375, + "logps/ref_rejected": -238.2875518798828, + "logps/rejected": -310.189453125, + "loss": 0.6502, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.345707893371582, + "rewards/grad_term": 0.013503390364348888, + "rewards/margins": 5.844482421875, + "rewards/rejected": -7.19019079208374, + "step": 387 + }, + { + "epoch": 0.8048748865551666, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 58.11377216377975, + "learning_rate": 6.655132641291811e-07, + "logits/chosen": 0.20901203155517578, + "logits/rejected": 0.19806969165802002, + "logps/accuracies": 0.9375, + "logps/chosen": -327.6300354003906, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -298.32843017578125, + "logps/ref_rejected": -294.7974853515625, + "logps/rejected": -393.97967529296875, + "loss": 0.6463, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9301586151123047, + "rewards/grad_term": 0.0037229093722999096, + "rewards/margins": 6.988059997558594, + "rewards/rejected": -9.918218612670898, + "step": 388 + }, + { + "epoch": 0.8069493063658758, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 23.205827934491982, + "learning_rate": 6.643598615916954e-07, + "logits/chosen": 0.18306072056293488, + "logits/rejected": 0.23532596230506897, + "logps/accuracies": 0.875, + "logps/chosen": -248.78665161132812, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -254.37738037109375, + "logps/ref_rejected": -251.6569061279297, + "logps/rejected": -319.8973388671875, + "loss": 0.5127, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5590727925300598, + "rewards/grad_term": 0.001746954396367073, + "rewards/margins": 7.383120536804199, + "rewards/rejected": -6.824047088623047, + "step": 389 + }, + { + "epoch": 0.809023726176585, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 46.61067138619315, + "learning_rate": 6.632064590542099e-07, + "logits/chosen": 0.16985514760017395, + "logits/rejected": 0.16642533242702484, + "logps/accuracies": 0.9375, + "logps/chosen": -340.10723876953125, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -344.4974670410156, + "logps/ref_rejected": -371.35791015625, + "logps/rejected": -436.1505432128906, + "loss": 0.5613, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4390239119529724, + "rewards/grad_term": 0.007462616544216871, + "rewards/margins": 6.918284893035889, + "rewards/rejected": -6.47926139831543, + "step": 390 + }, + { + "epoch": 0.8110981459872941, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 57.80966589693944, + "learning_rate": 6.620530565167242e-07, + "logits/chosen": -0.10273560136556625, + "logits/rejected": -0.0613471083343029, + "logps/accuracies": 0.75, + "logps/chosen": -216.7875518798828, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -226.03538513183594, + "logps/ref_rejected": -221.71621704101562, + "logps/rejected": -261.552978515625, + "loss": 0.5161, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9247859120368958, + "rewards/grad_term": 0.0170612595975399, + "rewards/margins": 4.9084649085998535, + "rewards/rejected": -3.9836790561676025, + "step": 391 + }, + { + "epoch": 0.8131725657980033, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 62.6241872293078, + "learning_rate": 6.608996539792387e-07, + "logits/chosen": 0.22374431788921356, + "logits/rejected": 0.22435928881168365, + "logps/accuracies": 0.625, + "logps/chosen": -285.15252685546875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -289.751220703125, + "logps/ref_rejected": -287.3389587402344, + "logps/rejected": -332.70794677734375, + "loss": 0.5729, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4598711133003235, + "rewards/grad_term": 0.01684059388935566, + "rewards/margins": 4.996764659881592, + "rewards/rejected": -4.536893844604492, + "step": 392 + }, + { + "epoch": 0.8152469856087126, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 104.53799642819773, + "learning_rate": 6.597462514417531e-07, + "logits/chosen": 0.1355782002210617, + "logits/rejected": 0.15115031599998474, + "logps/accuracies": 0.6875, + "logps/chosen": -267.6117858886719, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -278.65167236328125, + "logps/ref_rejected": -268.8196105957031, + "logps/rejected": -310.05419921875, + "loss": 0.655, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1039892435073853, + "rewards/grad_term": 0.0200988557189703, + "rewards/margins": 5.227451801300049, + "rewards/rejected": -4.123462677001953, + "step": 393 + }, + { + "epoch": 0.8173214054194218, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5625, + "grad_norm": 91.5589589431653, + "learning_rate": 6.585928489042676e-07, + "logits/chosen": 0.05219127982854843, + "logits/rejected": 0.1293550282716751, + "logps/accuracies": 0.4375, + "logps/chosen": -287.01702880859375, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -297.4679870605469, + "logps/ref_rejected": -308.44219970703125, + "logps/rejected": -331.50775146484375, + "loss": 0.6269, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0450924634933472, + "rewards/grad_term": 0.0215632114559412, + "rewards/margins": 3.3516530990600586, + "rewards/rejected": -2.306560516357422, + "step": 394 + }, + { + "epoch": 0.819395825230131, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 80.60637705363116, + "learning_rate": 6.57439446366782e-07, + "logits/chosen": -0.04064434394240379, + "logits/rejected": -0.011088773608207703, + "logps/accuracies": 0.75, + "logps/chosen": -245.4797821044922, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -250.2657928466797, + "logps/ref_rejected": -278.16424560546875, + "logps/rejected": -307.46978759765625, + "loss": 0.6575, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4786025285720825, + "rewards/grad_term": 0.023240692913532257, + "rewards/margins": 3.4091572761535645, + "rewards/rejected": -2.9305543899536133, + "step": 395 + }, + { + "epoch": 0.8214702450408401, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 37.32684776835478, + "learning_rate": 6.562860438292964e-07, + "logits/chosen": 0.10297183692455292, + "logits/rejected": 0.11840492486953735, + "logps/accuracies": 0.625, + "logps/chosen": -298.5022888183594, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -309.21044921875, + "logps/ref_rejected": -305.84539794921875, + "logps/rejected": -328.775390625, + "loss": 0.6662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0708208084106445, + "rewards/grad_term": 0.022083457559347153, + "rewards/margins": 3.363819122314453, + "rewards/rejected": -2.2929983139038086, + "step": 396 + }, + { + "epoch": 0.8235446648515493, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 23.575297499080293, + "learning_rate": 6.551326412918108e-07, + "logits/chosen": 0.11109241843223572, + "logits/rejected": 0.11409325897693634, + "logps/accuracies": 0.9375, + "logps/chosen": -276.38092041015625, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -285.5079345703125, + "logps/ref_rejected": -288.4066162109375, + "logps/rejected": -335.4505920410156, + "loss": 0.5911, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9127010107040405, + "rewards/grad_term": 0.014403178356587887, + "rewards/margins": 5.617100715637207, + "rewards/rejected": -4.704399585723877, + "step": 397 + }, + { + "epoch": 0.8256190846622585, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 46.15593222404285, + "learning_rate": 6.539792387543253e-07, + "logits/chosen": 0.09000806510448456, + "logits/rejected": 0.09876266866922379, + "logps/accuracies": 0.75, + "logps/chosen": -270.1310119628906, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -274.2320556640625, + "logps/ref_rejected": -265.1705322265625, + "logps/rejected": -315.0960693359375, + "loss": 0.5498, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41010797023773193, + "rewards/grad_term": 0.010970378294587135, + "rewards/margins": 5.402661323547363, + "rewards/rejected": -4.992552757263184, + "step": 398 + }, + { + "epoch": 0.8276935044729677, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 44.74586930025993, + "learning_rate": 6.528258362168396e-07, + "logits/chosen": 0.26735085248947144, + "logits/rejected": 0.30994755029678345, + "logps/accuracies": 0.875, + "logps/chosen": -246.11720275878906, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -253.5211181640625, + "logps/ref_rejected": -260.9595031738281, + "logps/rejected": -306.35498046875, + "loss": 0.5491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7403918504714966, + "rewards/grad_term": 0.01072466466575861, + "rewards/margins": 5.279941558837891, + "rewards/rejected": -4.539549827575684, + "step": 399 + }, + { + "epoch": 0.829767924283677, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 52.75963999398655, + "learning_rate": 6.516724336793541e-07, + "logits/chosen": 0.40581169724464417, + "logits/rejected": 0.43723931908607483, + "logps/accuracies": 0.8125, + "logps/chosen": -302.2900085449219, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -299.8786926269531, + "logps/ref_rejected": -311.3181457519531, + "logps/rejected": -364.59326171875, + "loss": 0.5293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24113261699676514, + "rewards/grad_term": 0.017243320122361183, + "rewards/margins": 5.086377143859863, + "rewards/rejected": -5.32750940322876, + "step": 400 + }, + { + "epoch": 0.8318423440943861, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 75.29899473684678, + "learning_rate": 6.505190311418684e-07, + "logits/chosen": -0.09569695591926575, + "logits/rejected": -0.0767926424741745, + "logps/accuracies": 0.75, + "logps/chosen": -301.2300109863281, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -292.5848388671875, + "logps/ref_rejected": -290.6342468261719, + "logps/rejected": -361.4559326171875, + "loss": 0.5411, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8645120859146118, + "rewards/grad_term": 0.014016557484865189, + "rewards/margins": 6.21765661239624, + "rewards/rejected": -7.0821685791015625, + "step": 401 + }, + { + "epoch": 0.8339167639050953, + "flips/correct->correct": 0.1875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.625, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 16.000137065905, + "learning_rate": 6.493656286043829e-07, + "logits/chosen": 0.13318368792533875, + "logits/rejected": 0.1401294469833374, + "logps/accuracies": 0.8125, + "logps/chosen": -306.74395751953125, + "logps/ref_accuracies": 0.1875, + "logps/ref_chosen": -315.7019958496094, + "logps/ref_rejected": -279.975341796875, + "logps/rejected": -362.96795654296875, + "loss": 0.5658, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8958021998405457, + "rewards/grad_term": 0.0026785405352711678, + "rewards/margins": 9.195062637329102, + "rewards/rejected": -8.299260139465332, + "step": 402 + }, + { + "epoch": 0.8359911837158045, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 39.13628890660203, + "learning_rate": 6.482122260668973e-07, + "logits/chosen": 0.4807916283607483, + "logits/rejected": 0.6536089181900024, + "logps/accuracies": 0.8125, + "logps/chosen": -300.46356201171875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -272.149658203125, + "logps/ref_rejected": -331.09649658203125, + "logps/rejected": -420.2705078125, + "loss": 0.5903, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.831390142440796, + "rewards/grad_term": 0.010690869763493538, + "rewards/margins": 6.086010932922363, + "rewards/rejected": -8.917401313781738, + "step": 403 + }, + { + "epoch": 0.8380656035265137, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 30.654540085811046, + "learning_rate": 6.470588235294117e-07, + "logits/chosen": 0.22139021754264832, + "logits/rejected": 0.2569182515144348, + "logps/accuracies": 0.75, + "logps/chosen": -349.45458984375, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -313.68267822265625, + "logps/ref_rejected": -315.2803955078125, + "logps/rejected": -405.5434265136719, + "loss": 0.6411, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5771892070770264, + "rewards/grad_term": 0.005928752478212118, + "rewards/margins": 5.449113368988037, + "rewards/rejected": -9.0263032913208, + "step": 404 + }, + { + "epoch": 0.8401400233372228, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 45.0761906012067, + "learning_rate": 6.459054209919261e-07, + "logits/chosen": 0.35096606612205505, + "logits/rejected": 0.44806602597236633, + "logps/accuracies": 0.8125, + "logps/chosen": -223.75851440429688, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -213.4886474609375, + "logps/ref_rejected": -226.67689514160156, + "logps/rejected": -300.3026428222656, + "loss": 0.7055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.026986002922058, + "rewards/grad_term": 0.006491546984761953, + "rewards/margins": 6.3355865478515625, + "rewards/rejected": -7.36257266998291, + "step": 405 + }, + { + "epoch": 0.842214443147932, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 26.11429132364704, + "learning_rate": 6.447520184544407e-07, + "logits/chosen": 0.051110029220581055, + "logits/rejected": 0.10619683563709259, + "logps/accuracies": 0.875, + "logps/chosen": -313.3158874511719, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -292.2518310546875, + "logps/ref_rejected": -291.42193603515625, + "logps/rejected": -382.94781494140625, + "loss": 0.7203, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1064045429229736, + "rewards/grad_term": 0.006023161578923464, + "rewards/margins": 7.046186447143555, + "rewards/rejected": -9.15259075164795, + "step": 406 + }, + { + "epoch": 0.8442888629586413, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 49.70258596298242, + "learning_rate": 6.43598615916955e-07, + "logits/chosen": 0.2751619219779968, + "logits/rejected": 0.2619101107120514, + "logps/accuracies": 0.875, + "logps/chosen": -298.23443603515625, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -270.52728271484375, + "logps/ref_rejected": -262.9530029296875, + "logps/rejected": -359.8721618652344, + "loss": 0.6965, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7707149982452393, + "rewards/grad_term": 0.008775541558861732, + "rewards/margins": 6.921198844909668, + "rewards/rejected": -9.691913604736328, + "step": 407 + }, + { + "epoch": 0.8463632827693505, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.375, + "grad_norm": 76.86823523264724, + "learning_rate": 6.424452133794695e-07, + "logits/chosen": 0.10307708382606506, + "logits/rejected": 0.0942949503660202, + "logps/accuracies": 0.625, + "logps/chosen": -341.54669189453125, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -313.2756652832031, + "logps/ref_rejected": -308.5162658691406, + "logps/rejected": -385.50421142578125, + "loss": 0.7027, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.827104330062866, + "rewards/grad_term": 0.01655164361000061, + "rewards/margins": 4.871689796447754, + "rewards/rejected": -7.698794364929199, + "step": 408 + }, + { + "epoch": 0.8484377025800597, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 59.950272206470274, + "learning_rate": 6.412918108419838e-07, + "logits/chosen": 0.019011177122592926, + "logits/rejected": 0.09038500487804413, + "logps/accuracies": 0.9375, + "logps/chosen": -308.01776123046875, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -263.8279113769531, + "logps/ref_rejected": -277.70489501953125, + "logps/rejected": -373.0634765625, + "loss": 0.6458, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.418985366821289, + "rewards/grad_term": 0.01191724929958582, + "rewards/margins": 5.116873741149902, + "rewards/rejected": -9.535858154296875, + "step": 409 + }, + { + "epoch": 0.8505121223907688, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 60.54011532087722, + "learning_rate": 6.401384083044983e-07, + "logits/chosen": 0.105913445353508, + "logits/rejected": 0.05668123438954353, + "logps/accuracies": 0.9375, + "logps/chosen": -325.8265380859375, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -328.54022216796875, + "logps/ref_rejected": -309.3156433105469, + "logps/rejected": -414.94476318359375, + "loss": 0.5845, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27136293053627014, + "rewards/grad_term": 6.003907765261829e-05, + "rewards/margins": 10.834280014038086, + "rewards/rejected": -10.56291675567627, + "step": 410 + }, + { + "epoch": 0.852586542201478, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 47.192528321568695, + "learning_rate": 6.389850057670127e-07, + "logits/chosen": 0.24323594570159912, + "logits/rejected": 0.2931632995605469, + "logps/accuracies": 0.75, + "logps/chosen": -285.09918212890625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -271.9576721191406, + "logps/ref_rejected": -269.5434875488281, + "logps/rejected": -350.0928955078125, + "loss": 0.6445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3141498565673828, + "rewards/grad_term": 0.012051810510456562, + "rewards/margins": 6.740789413452148, + "rewards/rejected": -8.054939270019531, + "step": 411 + }, + { + "epoch": 0.8546609620121872, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 58.581221597188936, + "learning_rate": 6.378316032295271e-07, + "logits/chosen": -0.11936801671981812, + "logits/rejected": -0.12492658197879791, + "logps/accuracies": 0.9375, + "logps/chosen": -338.5908508300781, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -339.50506591796875, + "logps/ref_rejected": -328.51324462890625, + "logps/rejected": -415.5992126464844, + "loss": 0.5409, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0914229154586792, + "rewards/grad_term": 0.006874611601233482, + "rewards/margins": 8.800016403198242, + "rewards/rejected": -8.708593368530273, + "step": 412 + }, + { + "epoch": 0.8567353818228964, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 31.837832566947164, + "learning_rate": 6.366782006920415e-07, + "logits/chosen": 0.43914633989334106, + "logits/rejected": 0.5609852075576782, + "logps/accuracies": 0.875, + "logps/chosen": -258.90252685546875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -257.27178955078125, + "logps/ref_rejected": -292.46502685546875, + "logps/rejected": -361.5235900878906, + "loss": 0.5556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16307339072227478, + "rewards/grad_term": 0.015529593452811241, + "rewards/margins": 6.742788791656494, + "rewards/rejected": -6.905861854553223, + "step": 413 + }, + { + "epoch": 0.8588098016336057, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 130.10385493391817, + "learning_rate": 6.355247981545559e-07, + "logits/chosen": 0.33855926990509033, + "logits/rejected": 0.37952950596809387, + "logps/accuracies": 0.875, + "logps/chosen": -364.90386962890625, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -364.53131103515625, + "logps/ref_rejected": -361.88043212890625, + "logps/rejected": -423.7973937988281, + "loss": 0.5026, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.03725364804267883, + "rewards/grad_term": 0.014217305928468704, + "rewards/margins": 6.1544389724731445, + "rewards/rejected": -6.191693305969238, + "step": 414 + }, + { + "epoch": 0.8608842214443148, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0625, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 72.11770229856765, + "learning_rate": 6.343713956170703e-07, + "logits/chosen": 0.40989670157432556, + "logits/rejected": 0.4901154935359955, + "logps/accuracies": 0.6875, + "logps/chosen": -218.7454833984375, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -221.65101623535156, + "logps/ref_rejected": -227.98141479492188, + "logps/rejected": -275.6513366699219, + "loss": 0.5736, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29055318236351013, + "rewards/grad_term": 0.013483730144798756, + "rewards/margins": 5.057545185089111, + "rewards/rejected": -4.766992092132568, + "step": 415 + }, + { + "epoch": 0.862958641255024, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 51.89358785836724, + "learning_rate": 6.332179930795848e-07, + "logits/chosen": -0.05566471815109253, + "logits/rejected": 0.030616842210292816, + "logps/accuracies": 0.6875, + "logps/chosen": -316.4223327636719, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -327.1333312988281, + "logps/ref_rejected": -320.0599670410156, + "logps/rejected": -366.65240478515625, + "loss": 0.6136, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0711019039154053, + "rewards/grad_term": 0.016874371096491814, + "rewards/margins": 5.730344295501709, + "rewards/rejected": -4.659242153167725, + "step": 416 + }, + { + "epoch": 0.8650330610657332, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 38.31866291679492, + "learning_rate": 6.320645905420991e-07, + "logits/chosen": 0.1870647817850113, + "logits/rejected": 0.18267706036567688, + "logps/accuracies": 0.75, + "logps/chosen": -360.9079895019531, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -376.5132751464844, + "logps/ref_rejected": -365.2701721191406, + "logps/rejected": -400.93572998046875, + "loss": 0.591, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.56052565574646, + "rewards/grad_term": 0.011596291325986385, + "rewards/margins": 5.127077579498291, + "rewards/rejected": -3.5665524005889893, + "step": 417 + }, + { + "epoch": 0.8671074808764424, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 92.98916391894235, + "learning_rate": 6.309111880046136e-07, + "logits/chosen": 0.03672199696302414, + "logits/rejected": 0.061092860996723175, + "logps/accuracies": 0.8125, + "logps/chosen": -270.1746520996094, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -281.4916076660156, + "logps/ref_rejected": -294.692626953125, + "logps/rejected": -329.16668701171875, + "loss": 0.6474, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1316967010498047, + "rewards/grad_term": 0.014774792827665806, + "rewards/margins": 4.579105377197266, + "rewards/rejected": -3.447408437728882, + "step": 418 + }, + { + "epoch": 0.8691819006871515, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 23.063566639192924, + "learning_rate": 6.29757785467128e-07, + "logits/chosen": 0.3395993113517761, + "logits/rejected": 0.38536539673805237, + "logps/accuracies": 0.8125, + "logps/chosen": -286.3059997558594, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -296.46038818359375, + "logps/ref_rejected": -286.7039489746094, + "logps/rejected": -349.8249206542969, + "loss": 0.6349, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0154372453689575, + "rewards/grad_term": 0.009252113290131092, + "rewards/margins": 7.3275322914123535, + "rewards/rejected": -6.312095642089844, + "step": 419 + }, + { + "epoch": 0.8712563204978607, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 44.948117340095266, + "learning_rate": 6.286043829296425e-07, + "logits/chosen": -0.0751362144947052, + "logits/rejected": -0.003658019006252289, + "logps/accuracies": 0.75, + "logps/chosen": -268.517333984375, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -265.34674072265625, + "logps/ref_rejected": -274.8442077636719, + "logps/rejected": -334.3121643066406, + "loss": 0.6457, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3170568645000458, + "rewards/grad_term": 0.017749693244695663, + "rewards/margins": 5.629739284515381, + "rewards/rejected": -5.94679594039917, + "step": 420 + }, + { + "epoch": 0.8733307403085699, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 34.9886652815262, + "learning_rate": 6.274509803921569e-07, + "logits/chosen": 0.3326599597930908, + "logits/rejected": 0.3828299343585968, + "logps/accuracies": 0.875, + "logps/chosen": -316.2264404296875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -327.22039794921875, + "logps/ref_rejected": -343.5686950683594, + "logps/rejected": -394.07080078125, + "loss": 0.5868, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0993949174880981, + "rewards/grad_term": 0.012708110734820366, + "rewards/margins": 6.149601936340332, + "rewards/rejected": -5.050206661224365, + "step": 421 + }, + { + "epoch": 0.8754051601192792, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 17.44095726007292, + "learning_rate": 6.262975778546713e-07, + "logits/chosen": 0.06078142672777176, + "logits/rejected": -0.015550296753644943, + "logps/accuracies": 0.9375, + "logps/chosen": -322.6217346191406, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -336.0628356933594, + "logps/ref_rejected": -319.92767333984375, + "logps/rejected": -387.3755798339844, + "loss": 0.5532, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.344112515449524, + "rewards/grad_term": 0.007527807727456093, + "rewards/margins": 8.088907241821289, + "rewards/rejected": -6.7447943687438965, + "step": 422 + }, + { + "epoch": 0.8774795799299884, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.625, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 65.08075817623737, + "learning_rate": 6.251441753171857e-07, + "logits/chosen": 0.06982388347387314, + "logits/rejected": 0.007165290415287018, + "logps/accuracies": 1.0, + "logps/chosen": -300.95513916015625, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -306.9526062011719, + "logps/ref_rejected": -288.5630798339844, + "logps/rejected": -370.71649169921875, + "loss": 0.5573, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5997495651245117, + "rewards/grad_term": 0.006430043373256922, + "rewards/margins": 8.815089225769043, + "rewards/rejected": -8.215339660644531, + "step": 423 + }, + { + "epoch": 0.8795539997406975, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 24.805163857445233, + "learning_rate": 6.239907727797001e-07, + "logits/chosen": 0.22340461611747742, + "logits/rejected": 0.21236705780029297, + "logps/accuracies": 0.75, + "logps/chosen": -313.9115295410156, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -315.0428161621094, + "logps/ref_rejected": -296.20806884765625, + "logps/rejected": -360.9374694824219, + "loss": 0.5634, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11312799155712128, + "rewards/grad_term": 0.00796779990196228, + "rewards/margins": 6.586068153381348, + "rewards/rejected": -6.472940444946289, + "step": 424 + }, + { + "epoch": 0.8816284195514067, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 76.72911496113723, + "learning_rate": 6.228373702422145e-07, + "logits/chosen": 0.14682908356189728, + "logits/rejected": 0.15023761987686157, + "logps/accuracies": 0.75, + "logps/chosen": -277.3023681640625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -261.1825256347656, + "logps/ref_rejected": -256.98846435546875, + "logps/rejected": -323.0633239746094, + "loss": 0.5797, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6119821071624756, + "rewards/grad_term": 0.017880305647850037, + "rewards/margins": 4.995503902435303, + "rewards/rejected": -6.607484817504883, + "step": 425 + }, + { + "epoch": 0.8837028393621159, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 59.32877504186551, + "learning_rate": 6.21683967704729e-07, + "logits/chosen": 0.11907504498958588, + "logits/rejected": 0.10917734354734421, + "logps/accuracies": 0.8125, + "logps/chosen": -280.1270446777344, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -282.9510192871094, + "logps/ref_rejected": -269.74761962890625, + "logps/rejected": -354.8128662109375, + "loss": 0.535, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28239575028419495, + "rewards/grad_term": 0.011113264597952366, + "rewards/margins": 8.788921356201172, + "rewards/rejected": -8.506525993347168, + "step": 426 + }, + { + "epoch": 0.8857772591728251, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 127.10327511506759, + "learning_rate": 6.205305651672433e-07, + "logits/chosen": 0.17048220336437225, + "logits/rejected": 0.16041475534439087, + "logps/accuracies": 0.9375, + "logps/chosen": -294.7939758300781, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -294.1711730957031, + "logps/ref_rejected": -291.6072082519531, + "logps/rejected": -370.58660888671875, + "loss": 0.5691, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.062279731035232544, + "rewards/grad_term": 0.0018367553129792213, + "rewards/margins": 7.835660934448242, + "rewards/rejected": -7.8979411125183105, + "step": 427 + }, + { + "epoch": 0.8878516789835343, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 54.660911786291244, + "learning_rate": 6.193771626297578e-07, + "logits/chosen": 0.036782991141080856, + "logits/rejected": 0.06632021814584732, + "logps/accuracies": 0.8125, + "logps/chosen": -297.9986572265625, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -289.60797119140625, + "logps/ref_rejected": -254.0615234375, + "logps/rejected": -336.6078796386719, + "loss": 0.5262, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8390707969665527, + "rewards/grad_term": 0.00769506860524416, + "rewards/margins": 7.415563583374023, + "rewards/rejected": -8.254634857177734, + "step": 428 + }, + { + "epoch": 0.8899260987942434, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 50.959026978469176, + "learning_rate": 6.182237600922721e-07, + "logits/chosen": 0.2548729181289673, + "logits/rejected": 0.24063560366630554, + "logps/accuracies": 0.9375, + "logps/chosen": -358.7782287597656, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -354.93792724609375, + "logps/ref_rejected": -339.633544921875, + "logps/rejected": -428.53302001953125, + "loss": 0.5172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3840335011482239, + "rewards/grad_term": 0.003113335929811001, + "rewards/margins": 8.505916595458984, + "rewards/rejected": -8.8899507522583, + "step": 429 + }, + { + "epoch": 0.8920005186049527, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 29.304522529635086, + "learning_rate": 6.170703575547866e-07, + "logits/chosen": 0.11386538296937943, + "logits/rejected": 0.1519451141357422, + "logps/accuracies": 0.8125, + "logps/chosen": -247.40721130371094, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -234.81825256347656, + "logps/ref_rejected": -226.5369110107422, + "logps/rejected": -298.44622802734375, + "loss": 0.6175, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2588945627212524, + "rewards/grad_term": 0.014798032119870186, + "rewards/margins": 5.93203592300415, + "rewards/rejected": -7.1909308433532715, + "step": 430 + }, + { + "epoch": 0.8940749384156619, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 33.20014491905227, + "learning_rate": 6.159169550173011e-07, + "logits/chosen": 0.29805174469947815, + "logits/rejected": 0.33232244849205017, + "logps/accuracies": 0.8125, + "logps/chosen": -336.54693603515625, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -328.70526123046875, + "logps/ref_rejected": -330.7543029785156, + "logps/rejected": -422.9801330566406, + "loss": 0.5606, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7841659784317017, + "rewards/grad_term": 0.006449039559811354, + "rewards/margins": 8.438421249389648, + "rewards/rejected": -9.222586631774902, + "step": 431 + }, + { + "epoch": 0.8961493582263711, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 30.864555897425223, + "learning_rate": 6.147635524798154e-07, + "logits/chosen": 0.12058807164430618, + "logits/rejected": 0.13604214787483215, + "logps/accuracies": 0.9375, + "logps/chosen": -287.39691162109375, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -291.1800842285156, + "logps/ref_rejected": -289.5397644042969, + "logps/rejected": -376.7043151855469, + "loss": 0.5595, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37831762433052063, + "rewards/grad_term": 0.0001301583251915872, + "rewards/margins": 9.094771385192871, + "rewards/rejected": -8.716453552246094, + "step": 432 + }, + { + "epoch": 0.8982237780370802, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 48.56405217889427, + "learning_rate": 6.136101499423299e-07, + "logits/chosen": 0.3865184783935547, + "logits/rejected": 0.4680458903312683, + "logps/accuracies": 0.75, + "logps/chosen": -285.0791015625, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -283.8902587890625, + "logps/ref_rejected": -293.55670166015625, + "logps/rejected": -369.1327209472656, + "loss": 0.5467, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1188843846321106, + "rewards/grad_term": 0.006384614389389753, + "rewards/margins": 7.438718795776367, + "rewards/rejected": -7.557602882385254, + "step": 433 + }, + { + "epoch": 0.9002981978477894, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 41.55010194963713, + "learning_rate": 6.124567474048442e-07, + "logits/chosen": 0.37142789363861084, + "logits/rejected": 0.4116554856300354, + "logps/accuracies": 0.6875, + "logps/chosen": -264.35888671875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -255.59439086914062, + "logps/ref_rejected": -254.5562744140625, + "logps/rejected": -320.098388671875, + "loss": 0.5228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8764491081237793, + "rewards/grad_term": 0.009346509352326393, + "rewards/margins": 5.6777663230896, + "rewards/rejected": -6.5542144775390625, + "step": 434 + }, + { + "epoch": 0.9023726176584986, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 41.02709525636894, + "learning_rate": 6.113033448673587e-07, + "logits/chosen": -0.03402477130293846, + "logits/rejected": 0.10666719824075699, + "logps/accuracies": 0.9375, + "logps/chosen": -330.32745361328125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -331.1935729980469, + "logps/ref_rejected": -342.39508056640625, + "logps/rejected": -409.2434997558594, + "loss": 0.5445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08661463856697083, + "rewards/grad_term": 0.011018088087439537, + "rewards/margins": 6.771457672119141, + "rewards/rejected": -6.684843063354492, + "step": 435 + }, + { + "epoch": 0.9044470374692078, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 46.61949118855806, + "learning_rate": 6.101499423298731e-07, + "logits/chosen": 0.033953070640563965, + "logits/rejected": 0.005475502926856279, + "logps/accuracies": 0.6875, + "logps/chosen": -304.1283874511719, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -310.3152160644531, + "logps/ref_rejected": -276.940185546875, + "logps/rejected": -349.3773193359375, + "loss": 0.5265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6186816692352295, + "rewards/grad_term": 0.0051316795870661736, + "rewards/margins": 7.862398147583008, + "rewards/rejected": -7.243716239929199, + "step": 436 + }, + { + "epoch": 0.9065214572799171, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 55.50940525398979, + "learning_rate": 6.089965397923875e-07, + "logits/chosen": 0.14566786587238312, + "logits/rejected": 0.14087940752506256, + "logps/accuracies": 0.875, + "logps/chosen": -307.6397705078125, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -306.0394592285156, + "logps/ref_rejected": -290.89068603515625, + "logps/rejected": -374.4580078125, + "loss": 0.5268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1600308120250702, + "rewards/grad_term": 0.003625539131462574, + "rewards/margins": 8.196700096130371, + "rewards/rejected": -8.356730461120605, + "step": 437 + }, + { + "epoch": 0.9085958770906262, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 36.429819834561926, + "learning_rate": 6.078431372549019e-07, + "logits/chosen": 0.17382624745368958, + "logits/rejected": 0.17122478783130646, + "logps/accuracies": 0.9375, + "logps/chosen": -268.6485595703125, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -274.4229736328125, + "logps/ref_rejected": -260.6642761230469, + "logps/rejected": -344.1253662109375, + "loss": 0.5025, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5774391889572144, + "rewards/grad_term": 0.0027016454841941595, + "rewards/margins": 8.923548698425293, + "rewards/rejected": -8.346110343933105, + "step": 438 + }, + { + "epoch": 0.9106702969013354, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 33.56713984591113, + "learning_rate": 6.066897347174163e-07, + "logits/chosen": 0.09694240987300873, + "logits/rejected": 0.22976186871528625, + "logps/accuracies": 0.9375, + "logps/chosen": -258.59161376953125, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -264.9013977050781, + "logps/ref_rejected": -288.00958251953125, + "logps/rejected": -353.9190673828125, + "loss": 0.5424, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6309794187545776, + "rewards/grad_term": 0.008919828571379185, + "rewards/margins": 7.221925258636475, + "rewards/rejected": -6.590945720672607, + "step": 439 + }, + { + "epoch": 0.9127447167120446, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.125, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 22.84688441463557, + "learning_rate": 6.055363321799307e-07, + "logits/chosen": -0.12110434472560883, + "logits/rejected": -0.06775850802659988, + "logps/accuracies": 0.625, + "logps/chosen": -252.79736328125, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -241.74078369140625, + "logps/ref_rejected": -250.46945190429688, + "logps/rejected": -318.03790283203125, + "loss": 0.5855, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1056597232818604, + "rewards/grad_term": 0.011472761631011963, + "rewards/margins": 5.651185989379883, + "rewards/rejected": -6.756845951080322, + "step": 440 + }, + { + "epoch": 0.9148191365227538, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 37.379916915616754, + "learning_rate": 6.043829296424452e-07, + "logits/chosen": 0.10848057270050049, + "logits/rejected": 0.11391180008649826, + "logps/accuracies": 0.8125, + "logps/chosen": -311.4754943847656, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -313.4561462402344, + "logps/ref_rejected": -308.49609375, + "logps/rejected": -389.3127136230469, + "loss": 0.5174, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19806542992591858, + "rewards/grad_term": 0.007313254754990339, + "rewards/margins": 8.279730796813965, + "rewards/rejected": -8.0816650390625, + "step": 441 + }, + { + "epoch": 0.916893556333463, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 29.537088633715857, + "learning_rate": 6.032295271049595e-07, + "logits/chosen": -0.04151641204953194, + "logits/rejected": -0.03607035428285599, + "logps/accuracies": 0.875, + "logps/chosen": -309.4384765625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -318.02105712890625, + "logps/ref_rejected": -330.68695068359375, + "logps/rejected": -403.2144775390625, + "loss": 0.5329, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8582614064216614, + "rewards/grad_term": 0.007493661250919104, + "rewards/margins": 8.111011505126953, + "rewards/rejected": -7.252751350402832, + "step": 442 + }, + { + "epoch": 0.9189679761441721, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 27.777171215232052, + "learning_rate": 6.02076124567474e-07, + "logits/chosen": 0.2931877374649048, + "logits/rejected": 0.2972795367240906, + "logps/accuracies": 0.75, + "logps/chosen": -301.7099609375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -301.0013122558594, + "logps/ref_rejected": -281.2795104980469, + "logps/rejected": -338.0366516113281, + "loss": 0.564, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07086822390556335, + "rewards/grad_term": 0.012654304504394531, + "rewards/margins": 5.604846954345703, + "rewards/rejected": -5.67571496963501, + "step": 443 + }, + { + "epoch": 0.9210423959548814, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 26.705457998840874, + "learning_rate": 6.009227220299884e-07, + "logits/chosen": 0.24231280386447906, + "logits/rejected": 0.25109824538230896, + "logps/accuracies": 0.8125, + "logps/chosen": -255.10946655273438, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -262.9153747558594, + "logps/ref_rejected": -290.76556396484375, + "logps/rejected": -346.1953430175781, + "loss": 0.5406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7805902361869812, + "rewards/grad_term": 0.00831932295113802, + "rewards/margins": 6.32357120513916, + "rewards/rejected": -5.542980670928955, + "step": 444 + }, + { + "epoch": 0.9231168157655906, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 45.863702043985306, + "learning_rate": 5.997693194925029e-07, + "logits/chosen": 0.30657637119293213, + "logits/rejected": 0.40025636553764343, + "logps/accuracies": 0.6875, + "logps/chosen": -378.9405517578125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -375.9232482910156, + "logps/ref_rejected": -406.2908935546875, + "logps/rejected": -476.7547912597656, + "loss": 0.4917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30173051357269287, + "rewards/grad_term": 0.007800333667546511, + "rewards/margins": 6.744661808013916, + "rewards/rejected": -7.046392440795898, + "step": 445 + }, + { + "epoch": 0.9251912355762998, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 58.839145890605785, + "learning_rate": 5.986159169550173e-07, + "logits/chosen": 0.2454492151737213, + "logits/rejected": 0.2175568789243698, + "logps/accuracies": 0.875, + "logps/chosen": -282.2199401855469, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -279.5404968261719, + "logps/ref_rejected": -255.2272186279297, + "logps/rejected": -328.5665283203125, + "loss": 0.4951, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2679447829723358, + "rewards/grad_term": 0.009870468638837337, + "rewards/margins": 7.065983295440674, + "rewards/rejected": -7.333928108215332, + "step": 446 + }, + { + "epoch": 0.9272656553870089, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 45.27476270388639, + "learning_rate": 5.974625144175317e-07, + "logits/chosen": 0.25637272000312805, + "logits/rejected": 0.284266859292984, + "logps/accuracies": 0.8125, + "logps/chosen": -308.41693115234375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -299.62353515625, + "logps/ref_rejected": -297.4239807128906, + "logps/rejected": -373.3579406738281, + "loss": 0.5676, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8793376088142395, + "rewards/grad_term": 0.01210443302989006, + "rewards/margins": 6.714059829711914, + "rewards/rejected": -7.5933966636657715, + "step": 447 + }, + { + "epoch": 0.9293400751977181, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 24.468747270838374, + "learning_rate": 5.963091118800461e-07, + "logits/chosen": -0.047158196568489075, + "logits/rejected": -0.004086131229996681, + "logps/accuracies": 0.8125, + "logps/chosen": -363.8712158203125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -358.2686462402344, + "logps/ref_rejected": -370.7213439941406, + "logps/rejected": -445.895263671875, + "loss": 0.5564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5602600574493408, + "rewards/grad_term": 0.010198243893682957, + "rewards/margins": 6.957134246826172, + "rewards/rejected": -7.517394065856934, + "step": 448 + }, + { + "epoch": 0.9293400751977181, + "eval_flips/correct->correct": 0.4433497488498688, + "eval_flips/correct->incorrect": 0.0, + "eval_flips/incorrect->correct": 0.37438422441482544, + "eval_flips/incorrect->incorrect": 0.1822660118341446, + "eval_logits/chosen": 0.12454497069120407, + "eval_logits/rejected": 0.16565194725990295, + "eval_logps/accuracies": 0.8177340030670166, + "eval_logps/chosen": -297.1930847167969, + "eval_logps/ref_accuracies": 0.4433497488498688, + "eval_logps/ref_chosen": -287.3511047363281, + "eval_logps/ref_rejected": -289.0460205078125, + "eval_logps/rejected": -360.1458740234375, + "eval_loss": 0.5838693976402283, + "eval_rewards/accuracies": 0.9113300442695618, + "eval_rewards/chosen": -0.9841962456703186, + "eval_rewards/grad_term": 0.01190107874572277, + "eval_rewards/margins": 6.125789165496826, + "eval_rewards/rejected": -7.1099853515625, + "eval_runtime": 804.5696, + "eval_samples_per_second": 2.011, + "eval_steps_per_second": 0.252, + "step": 448 + }, + { + "epoch": 0.9314144950084273, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 42.40430094029457, + "learning_rate": 5.951557093425605e-07, + "logits/chosen": 0.2953071594238281, + "logits/rejected": 0.32737353444099426, + "logps/accuracies": 0.75, + "logps/chosen": -246.87020874023438, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -241.10800170898438, + "logps/ref_rejected": -247.88864135742188, + "logps/rejected": -314.1338806152344, + "loss": 0.5558, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5762210488319397, + "rewards/grad_term": 0.016563208773732185, + "rewards/margins": 6.0483012199401855, + "rewards/rejected": -6.6245222091674805, + "step": 449 + }, + { + "epoch": 0.9334889148191365, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 55.56936800158885, + "learning_rate": 5.940023068050749e-07, + "logits/chosen": -0.24604183435440063, + "logits/rejected": -0.20258383452892303, + "logps/accuracies": 0.9375, + "logps/chosen": -280.2922058105469, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -271.5796813964844, + "logps/ref_rejected": -273.0106506347656, + "logps/rejected": -346.3686828613281, + "loss": 0.5873, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8712505102157593, + "rewards/grad_term": 0.010869830846786499, + "rewards/margins": 6.4645562171936035, + "rewards/rejected": -7.335805892944336, + "step": 450 + }, + { + "epoch": 0.9355633346298458, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.625, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 104.79125724883293, + "learning_rate": 5.928489042675894e-07, + "logits/chosen": 0.24167490005493164, + "logits/rejected": 0.2851963937282562, + "logps/accuracies": 0.9375, + "logps/chosen": -315.26214599609375, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -313.26177978515625, + "logps/ref_rejected": -285.7829895019531, + "logps/rejected": -376.867919921875, + "loss": 0.5462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20003750920295715, + "rewards/grad_term": 0.0005122160073369741, + "rewards/margins": 8.908455848693848, + "rewards/rejected": -9.10849380493164, + "step": 451 + }, + { + "epoch": 0.9376377544405549, + "flips/correct->correct": 0.6875, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 29.758835328305054, + "learning_rate": 5.916955017301037e-07, + "logits/chosen": 0.2663220167160034, + "logits/rejected": 0.3874686658382416, + "logps/accuracies": 0.875, + "logps/chosen": -262.48992919921875, + "logps/ref_accuracies": 0.6875, + "logps/ref_chosen": -257.1368408203125, + "logps/ref_rejected": -272.6324462890625, + "logps/rejected": -335.6940612792969, + "loss": 0.5535, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5353102684020996, + "rewards/grad_term": 0.008673110976815224, + "rewards/margins": 5.770854473114014, + "rewards/rejected": -6.3061652183532715, + "step": 452 + }, + { + "epoch": 0.9397121742512641, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 17.42639637388175, + "learning_rate": 5.905420991926182e-07, + "logits/chosen": 0.2760721743106842, + "logits/rejected": 0.3189687430858612, + "logps/accuracies": 0.75, + "logps/chosen": -275.108642578125, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -274.26934814453125, + "logps/ref_rejected": -267.088134765625, + "logps/rejected": -331.2291564941406, + "loss": 0.4576, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0839340090751648, + "rewards/grad_term": 0.01929977536201477, + "rewards/margins": 6.330172538757324, + "rewards/rejected": -6.414106369018555, + "step": 453 + }, + { + "epoch": 0.9417865940619733, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 43.82809699832689, + "learning_rate": 5.893886966551325e-07, + "logits/chosen": 0.21667756140232086, + "logits/rejected": 0.20864138007164001, + "logps/accuracies": 0.6875, + "logps/chosen": -275.2607727050781, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -276.90924072265625, + "logps/ref_rejected": -287.8323974609375, + "logps/rejected": -336.0874328613281, + "loss": 0.5327, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16484564542770386, + "rewards/grad_term": 0.018737200647592545, + "rewards/margins": 4.990347862243652, + "rewards/rejected": -4.825502395629883, + "step": 454 + }, + { + "epoch": 0.9438610138726825, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 38.23253448586923, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": 0.11684215813875198, + "logits/rejected": 0.24031777679920197, + "logps/accuracies": 0.9375, + "logps/chosen": -313.59710693359375, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -317.1141052246094, + "logps/ref_rejected": -372.68316650390625, + "logps/rejected": -439.81866455078125, + "loss": 0.5429, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.35169944167137146, + "rewards/grad_term": 0.00929531641304493, + "rewards/margins": 7.065249919891357, + "rewards/rejected": -6.713550567626953, + "step": 455 + }, + { + "epoch": 0.9459354336833917, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5625, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 56.76327899023629, + "learning_rate": 5.870818915801614e-07, + "logits/chosen": 0.10899796336889267, + "logits/rejected": 0.17427489161491394, + "logps/accuracies": 0.9375, + "logps/chosen": -296.0990295410156, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -301.2550048828125, + "logps/ref_rejected": -292.7115173339844, + "logps/rejected": -371.61187744140625, + "loss": 0.5333, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5155962705612183, + "rewards/grad_term": 0.00969706755131483, + "rewards/margins": 8.405632972717285, + "rewards/rejected": -7.890036582946777, + "step": 456 + }, + { + "epoch": 0.9480098534941008, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 55.72280578194114, + "learning_rate": 5.859284890426759e-07, + "logits/chosen": -0.003691728226840496, + "logits/rejected": 0.0035413503646850586, + "logps/accuracies": 0.8125, + "logps/chosen": -304.0773010253906, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -317.5519104003906, + "logps/ref_rejected": -319.302978515625, + "logps/rejected": -378.9814453125, + "loss": 0.5376, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.347464919090271, + "rewards/grad_term": 0.011599891819059849, + "rewards/margins": 7.315312385559082, + "rewards/rejected": -5.9678473472595215, + "step": 457 + }, + { + "epoch": 0.95008427330481, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 63.66588058556034, + "learning_rate": 5.847750865051903e-07, + "logits/chosen": -0.25413990020751953, + "logits/rejected": -0.17748790979385376, + "logps/accuracies": 0.9375, + "logps/chosen": -318.7485656738281, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -321.310546875, + "logps/ref_rejected": -325.7279052734375, + "logps/rejected": -377.45843505859375, + "loss": 0.5057, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2561964988708496, + "rewards/grad_term": 0.013258688151836395, + "rewards/margins": 5.429249286651611, + "rewards/rejected": -5.173052787780762, + "step": 458 + }, + { + "epoch": 0.9521586931155193, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 29.584209944027126, + "learning_rate": 5.836216839677048e-07, + "logits/chosen": -0.028666552156209946, + "logits/rejected": 0.027672436088323593, + "logps/accuracies": 0.75, + "logps/chosen": -297.3187561035156, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -300.38385009765625, + "logps/ref_rejected": -299.2121276855469, + "logps/rejected": -364.8226623535156, + "loss": 0.464, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3065095543861389, + "rewards/grad_term": 0.009040933102369308, + "rewards/margins": 6.867563247680664, + "rewards/rejected": -6.561053276062012, + "step": 459 + }, + { + "epoch": 0.9542331129262285, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 36.21237401329387, + "learning_rate": 5.824682814302191e-07, + "logits/chosen": 0.010993116535246372, + "logits/rejected": 0.10759762674570084, + "logps/accuracies": 0.8125, + "logps/chosen": -247.78744506835938, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -243.45068359375, + "logps/ref_rejected": -238.94656372070312, + "logps/rejected": -306.70025634765625, + "loss": 0.5576, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4336736798286438, + "rewards/grad_term": 0.010049809701740742, + "rewards/margins": 6.341697692871094, + "rewards/rejected": -6.775371551513672, + "step": 460 + }, + { + "epoch": 0.9563075327369377, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 67.07396455833435, + "learning_rate": 5.813148788927336e-07, + "logits/chosen": 0.2826724350452423, + "logits/rejected": 0.30516886711120605, + "logps/accuracies": 0.9375, + "logps/chosen": -303.901611328125, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -310.4884948730469, + "logps/ref_rejected": -321.95538330078125, + "logps/rejected": -394.9017639160156, + "loss": 0.5389, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6586897373199463, + "rewards/grad_term": 0.007842399179935455, + "rewards/margins": 7.953330993652344, + "rewards/rejected": -7.294641971588135, + "step": 461 + }, + { + "epoch": 0.9583819525476468, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 29.22489915607507, + "learning_rate": 5.801614763552479e-07, + "logits/chosen": 0.12667125463485718, + "logits/rejected": 0.24145105481147766, + "logps/accuracies": 1.0, + "logps/chosen": -228.75924682617188, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -230.16534423828125, + "logps/ref_rejected": -276.22015380859375, + "logps/rejected": -355.9960632324219, + "loss": 0.5172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14060965180397034, + "rewards/grad_term": 0.003057720372453332, + "rewards/margins": 8.118200302124023, + "rewards/rejected": -7.977591037750244, + "step": 462 + }, + { + "epoch": 0.960456372358356, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 46.6463409683984, + "learning_rate": 5.790080738177624e-07, + "logits/chosen": 0.22849154472351074, + "logits/rejected": 0.2621627748012543, + "logps/accuracies": 0.875, + "logps/chosen": -352.41827392578125, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -345.7464294433594, + "logps/ref_rejected": -329.25946044921875, + "logps/rejected": -404.5802001953125, + "loss": 0.4936, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6671811938285828, + "rewards/grad_term": 0.010429211892187595, + "rewards/margins": 6.864894866943359, + "rewards/rejected": -7.532076835632324, + "step": 463 + }, + { + "epoch": 0.9625307921690652, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 23.56747059673345, + "learning_rate": 5.778546712802767e-07, + "logits/chosen": 0.07257233560085297, + "logits/rejected": 0.10124337673187256, + "logps/accuracies": 0.9375, + "logps/chosen": -296.21209716796875, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -301.79473876953125, + "logps/ref_rejected": -295.86785888671875, + "logps/rejected": -370.70001220703125, + "loss": 0.5419, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.558262288570404, + "rewards/grad_term": 0.007962905801832676, + "rewards/margins": 8.041479110717773, + "rewards/rejected": -7.483217239379883, + "step": 464 + }, + { + "epoch": 0.9646052119797744, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 32.887206960447294, + "learning_rate": 5.767012687427912e-07, + "logits/chosen": -0.02765033021569252, + "logits/rejected": -0.04159718379378319, + "logps/accuracies": 0.875, + "logps/chosen": -301.87750244140625, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -294.6784973144531, + "logps/ref_rejected": -315.06695556640625, + "logps/rejected": -396.1573486328125, + "loss": 0.5015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7198995351791382, + "rewards/grad_term": 0.013939508236944675, + "rewards/margins": 7.389136791229248, + "rewards/rejected": -8.109036445617676, + "step": 465 + }, + { + "epoch": 0.9666796317904836, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 31.89130120157542, + "learning_rate": 5.755478662053056e-07, + "logits/chosen": 0.06474259495735168, + "logits/rejected": 0.1507914811372757, + "logps/accuracies": 0.75, + "logps/chosen": -362.8743896484375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -358.84747314453125, + "logps/ref_rejected": -336.5288391113281, + "logps/rejected": -405.94775390625, + "loss": 0.4804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4026913344860077, + "rewards/grad_term": 0.012680365703999996, + "rewards/margins": 6.539196014404297, + "rewards/rejected": -6.941887378692627, + "step": 466 + }, + { + "epoch": 0.9687540516011928, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.0625, + "grad_norm": 24.005282908584242, + "learning_rate": 5.7439446366782e-07, + "logits/chosen": 0.46883174777030945, + "logits/rejected": 0.5026016235351562, + "logps/accuracies": 0.9375, + "logps/chosen": -274.5486145019531, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -275.224853515625, + "logps/ref_rejected": -294.5133361816406, + "logps/rejected": -363.38482666015625, + "loss": 0.5302, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.06762228161096573, + "rewards/grad_term": 0.0132514713332057, + "rewards/margins": 6.954771041870117, + "rewards/rejected": -6.887148857116699, + "step": 467 + }, + { + "epoch": 0.970828471411902, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 93.90250048693143, + "learning_rate": 5.732410611303344e-07, + "logits/chosen": 0.13799475133419037, + "logits/rejected": 0.10960017144680023, + "logps/accuracies": 0.6875, + "logps/chosen": -306.63336181640625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -298.33892822265625, + "logps/ref_rejected": -294.1246032714844, + "logps/rejected": -364.4803466796875, + "loss": 0.558, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8294415473937988, + "rewards/grad_term": 0.011318499222397804, + "rewards/margins": 6.206131458282471, + "rewards/rejected": -7.0355730056762695, + "step": 468 + }, + { + "epoch": 0.9729028912226112, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 33.22451982924039, + "learning_rate": 5.72087658592849e-07, + "logits/chosen": 0.2051057368516922, + "logits/rejected": 0.39599326252937317, + "logps/accuracies": 0.875, + "logps/chosen": -297.212890625, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -296.35797119140625, + "logps/ref_rejected": -341.98760986328125, + "logps/rejected": -415.61602783203125, + "loss": 0.577, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08549117296934128, + "rewards/grad_term": 0.008653431199491024, + "rewards/margins": 7.277350902557373, + "rewards/rejected": -7.362841606140137, + "step": 469 + }, + { + "epoch": 0.9749773110333204, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 55.27297145431258, + "learning_rate": 5.709342560553633e-07, + "logits/chosen": 0.406170129776001, + "logits/rejected": 0.4695666432380676, + "logps/accuracies": 0.75, + "logps/chosen": -267.87518310546875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -264.7390441894531, + "logps/ref_rejected": -286.3910217285156, + "logps/rejected": -358.07012939453125, + "loss": 0.4829, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3136145770549774, + "rewards/grad_term": 0.010346844792366028, + "rewards/margins": 6.854294776916504, + "rewards/rejected": -7.167908668518066, + "step": 470 + }, + { + "epoch": 0.9770517308440295, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 46.76169337104463, + "learning_rate": 5.697808535178778e-07, + "logits/chosen": 0.021602880209684372, + "logits/rejected": 0.10054953396320343, + "logps/accuracies": 0.4375, + "logps/chosen": -277.87420654296875, + "logps/ref_accuracies": 0.3125, + "logps/ref_chosen": -278.6906433105469, + "logps/ref_rejected": -260.7660827636719, + "logps/rejected": -313.3102722167969, + "loss": 0.523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08164243400096893, + "rewards/grad_term": 0.015251345932483673, + "rewards/margins": 5.336061954498291, + "rewards/rejected": -5.254419326782227, + "step": 471 + }, + { + "epoch": 0.9791261506547387, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.3125, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 22.784427030938893, + "learning_rate": 5.686274509803921e-07, + "logits/chosen": 0.40202221274375916, + "logits/rejected": 0.47440534830093384, + "logps/accuracies": 0.875, + "logps/chosen": -262.98046875, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -269.6217956542969, + "logps/ref_rejected": -294.6548767089844, + "logps/rejected": -360.1872253417969, + "loss": 0.5311, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6641333103179932, + "rewards/grad_term": 0.008491192013025284, + "rewards/margins": 7.217367172241211, + "rewards/rejected": -6.553234100341797, + "step": 472 + }, + { + "epoch": 0.981200570465448, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.3125, + "grad_norm": 17.893894514454058, + "learning_rate": 5.674740484429066e-07, + "logits/chosen": 0.2951053977012634, + "logits/rejected": 0.31693655252456665, + "logps/accuracies": 0.6875, + "logps/chosen": -262.5206298828125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -264.88336181640625, + "logps/ref_rejected": -253.90045166015625, + "logps/rejected": -300.7421875, + "loss": 0.5814, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.23627310991287231, + "rewards/grad_term": 0.016665775328874588, + "rewards/margins": 4.920448303222656, + "rewards/rejected": -4.684175491333008, + "step": 473 + }, + { + "epoch": 0.9832749902761572, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 43.55914775192543, + "learning_rate": 5.66320645905421e-07, + "logits/chosen": -0.024594342336058617, + "logits/rejected": 0.07602076232433319, + "logps/accuracies": 0.75, + "logps/chosen": -239.41683959960938, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -248.70411682128906, + "logps/ref_rejected": -289.31231689453125, + "logps/rejected": -342.78070068359375, + "loss": 0.6143, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9287264347076416, + "rewards/grad_term": 0.012531589716672897, + "rewards/margins": 6.275561332702637, + "rewards/rejected": -5.346835136413574, + "step": 474 + }, + { + "epoch": 0.9853494100868664, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 62.70114024432645, + "learning_rate": 5.651672433679354e-07, + "logits/chosen": 0.10585808008909225, + "logits/rejected": 0.11864355206489563, + "logps/accuracies": 0.8125, + "logps/chosen": -296.0154724121094, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -307.2314453125, + "logps/ref_rejected": -307.9908752441406, + "logps/rejected": -372.58465576171875, + "loss": 0.5211, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1215964555740356, + "rewards/grad_term": 0.0117443036288023, + "rewards/margins": 7.5809736251831055, + "rewards/rejected": -6.459376811981201, + "step": 475 + }, + { + "epoch": 0.9874238298975755, + "flips/correct->correct": 0.375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.1875, + "grad_norm": 49.433246174946895, + "learning_rate": 5.640138408304498e-07, + "logits/chosen": -0.0011881794780492783, + "logits/rejected": 0.056654639542102814, + "logps/accuracies": 0.8125, + "logps/chosen": -287.03497314453125, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -299.6839904785156, + "logps/ref_rejected": -300.8826904296875, + "logps/rejected": -363.1770935058594, + "loss": 0.5397, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2649037837982178, + "rewards/grad_term": 0.011040883138775826, + "rewards/margins": 7.494347095489502, + "rewards/rejected": -6.229443550109863, + "step": 476 + }, + { + "epoch": 0.9894982497082847, + "flips/correct->correct": 0.5625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.1875, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 37.13699226629539, + "learning_rate": 5.628604382929642e-07, + "logits/chosen": 0.22437655925750732, + "logits/rejected": 0.2689896821975708, + "logps/accuracies": 0.75, + "logps/chosen": -257.7532958984375, + "logps/ref_accuracies": 0.5625, + "logps/ref_chosen": -264.4325256347656, + "logps/ref_rejected": -272.7551574707031, + "logps/rejected": -302.99462890625, + "loss": 0.56, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.667922854423523, + "rewards/grad_term": 0.02357163466513157, + "rewards/margins": 3.6918697357177734, + "rewards/rejected": -3.02394700050354, + "step": 477 + }, + { + "epoch": 0.9915726695189939, + "flips/correct->correct": 0.4375, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.4375, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 58.00284238737268, + "learning_rate": 5.617070357554786e-07, + "logits/chosen": 0.23218779265880585, + "logits/rejected": 0.20929032564163208, + "logps/accuracies": 0.875, + "logps/chosen": -320.3446350097656, + "logps/ref_accuracies": 0.4375, + "logps/ref_chosen": -321.2319641113281, + "logps/ref_rejected": -327.994140625, + "logps/rejected": -400.4293212890625, + "loss": 0.5079, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08873284608125687, + "rewards/grad_term": 0.011475574225187302, + "rewards/margins": 7.332255840301514, + "rewards/rejected": -7.243522644042969, + "step": 478 + }, + { + "epoch": 0.9936470893297031, + "flips/correct->correct": 0.3125, + "flips/correct->incorrect": 0.0625, + "flips/incorrect->correct": 0.375, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 30.26855049657639, + "learning_rate": 5.605536332179931e-07, + "logits/chosen": 0.20646262168884277, + "logits/rejected": 0.18982850015163422, + "logps/accuracies": 0.6875, + "logps/chosen": -338.36700439453125, + "logps/ref_accuracies": 0.375, + "logps/ref_chosen": -340.2660217285156, + "logps/ref_rejected": -332.7535705566406, + "logps/rejected": -394.439697265625, + "loss": 0.535, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1898985505104065, + "rewards/grad_term": 0.012958088889718056, + "rewards/margins": 6.358510971069336, + "rewards/rejected": -6.168612957000732, + "step": 479 + }, + { + "epoch": 0.9957215091404122, + "flips/correct->correct": 0.625, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.125, + "grad_norm": 19.142226432349567, + "learning_rate": 5.594002306805074e-07, + "logits/chosen": 0.26075422763824463, + "logits/rejected": 0.3115725517272949, + "logps/accuracies": 0.875, + "logps/chosen": -263.19952392578125, + "logps/ref_accuracies": 0.625, + "logps/ref_chosen": -252.8601531982422, + "logps/ref_rejected": -255.4573516845703, + "logps/rejected": -332.5611267089844, + "loss": 0.5524, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.033937931060791, + "rewards/grad_term": 0.014926022849977016, + "rewards/margins": 6.676440715789795, + "rewards/rejected": -7.710378170013428, + "step": 480 + } + ], + "logging_steps": 1, + "max_steps": 964, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 96, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}