{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 51, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -3.131476879119873, "debug/policy_chosen_logps": -223.49798583984375, "debug/policy_rejected_logits": -3.0218234062194824, "debug/policy_rejected_logps": -181.94036865234375, "debug/reference_chosen_logps": -223.49798583984375, "debug/reference_rejected_logps": -181.94036865234375, "epoch": 0.0196078431372549, "grad_norm": 9.59268936350444, "learning_rate": 1e-06, "logits/chosen": -3.131476879119873, "logits/rejected": -3.0218234062194824, "logps/chosen": -223.49798583984375, "logps/rejected": -181.94036865234375, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -3.1443662643432617, "debug/policy_chosen_logps": -209.216552734375, "debug/policy_rejected_logits": -3.076768159866333, "debug/policy_rejected_logps": -170.2884521484375, "debug/reference_chosen_logps": -209.07872009277344, "debug/reference_rejected_logps": -169.68731689453125, "epoch": 0.0392156862745098, "grad_norm": 8.71018223332819, "learning_rate": 1e-06, "logits/chosen": -3.1443662643432617, "logits/rejected": -3.076768159866333, "logps/chosen": -209.216552734375, "logps/rejected": -170.2884521484375, "loss": 0.4974, "rewards/accuracies": 0.875, "rewards/chosen": -0.001378459855914116, "rewards/margins": 0.004632873460650444, "rewards/rejected": -0.00601133331656456, "step": 2 }, { "debug/policy_chosen_logits": -3.135432243347168, "debug/policy_chosen_logps": -203.98123168945312, "debug/policy_rejected_logits": -3.058173179626465, "debug/policy_rejected_logps": -171.8382568359375, "debug/reference_chosen_logps": -206.17086791992188, "debug/reference_rejected_logps": -172.7147216796875, "epoch": 0.058823529411764705, "grad_norm": 11.448652871780954, "learning_rate": 1e-06, "logits/chosen": -3.135432243347168, "logits/rejected": -3.058173179626465, "logps/chosen": -203.98123168945312, "logps/rejected": -171.8382568359375, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 0.02189634181559086, "rewards/margins": 0.013131675310432911, "rewards/rejected": 0.008764667436480522, "step": 3 }, { "debug/policy_chosen_logits": -3.0565543174743652, "debug/policy_chosen_logps": -206.8490447998047, "debug/policy_rejected_logits": -2.9550375938415527, "debug/policy_rejected_logps": -174.601318359375, "debug/reference_chosen_logps": -206.09422302246094, "debug/reference_rejected_logps": -172.06332397460938, "epoch": 0.0784313725490196, "grad_norm": 8.681717474563778, "learning_rate": 1e-06, "logits/chosen": -3.0565543174743652, "logits/rejected": -2.9550375938415527, "logps/chosen": -206.8490447998047, "logps/rejected": -174.601318359375, "loss": 0.4855, "rewards/accuracies": 0.75, "rewards/chosen": -0.007548102643340826, "rewards/margins": 0.017831895500421524, "rewards/rejected": -0.025379998609423637, "step": 4 }, { "debug/policy_chosen_logits": -3.0399742126464844, "debug/policy_chosen_logps": -206.2752227783203, "debug/policy_rejected_logits": -2.9232938289642334, "debug/policy_rejected_logps": -156.70419311523438, "debug/reference_chosen_logps": -205.56788635253906, "debug/reference_rejected_logps": -150.99920654296875, "epoch": 0.09803921568627451, "grad_norm": 8.125347499138313, "learning_rate": 1e-06, "logits/chosen": -3.0399742126464844, "logits/rejected": -2.9232938289642334, "logps/chosen": -206.2752227783203, "logps/rejected": -156.70419311523438, "loss": 0.4648, "rewards/accuracies": 1.0, "rewards/chosen": -0.00707334466278553, "rewards/margins": 0.04997648298740387, "rewards/rejected": -0.05704982578754425, "step": 5 }, { "debug/policy_chosen_logits": -3.2029356956481934, "debug/policy_chosen_logps": -206.69122314453125, "debug/policy_rejected_logits": -3.122507095336914, "debug/policy_rejected_logps": -199.9852752685547, "debug/reference_chosen_logps": -210.3363494873047, "debug/reference_rejected_logps": -200.11654663085938, "epoch": 0.11764705882352941, "grad_norm": 10.680997056066209, "learning_rate": 1e-06, "logits/chosen": -3.2029356956481934, "logits/rejected": -3.122507095336914, "logps/chosen": -206.69122314453125, "logps/rejected": -199.9852752685547, "loss": 0.4523, "rewards/accuracies": 1.0, "rewards/chosen": 0.03645110875368118, "rewards/margins": 0.03513820841908455, "rewards/rejected": 0.0013129040598869324, "step": 6 }, { "debug/policy_chosen_logits": -3.203911781311035, "debug/policy_chosen_logps": -197.77503967285156, "debug/policy_rejected_logits": -2.984386444091797, "debug/policy_rejected_logps": -168.83746337890625, "debug/reference_chosen_logps": -199.79531860351562, "debug/reference_rejected_logps": -164.98037719726562, "epoch": 0.13725490196078433, "grad_norm": 7.437519030782708, "learning_rate": 1e-06, "logits/chosen": -3.203911781311035, "logits/rejected": -2.984386444091797, "logps/chosen": -197.77503967285156, "logps/rejected": -168.83746337890625, "loss": 0.4436, "rewards/accuracies": 0.875, "rewards/chosen": 0.02020280808210373, "rewards/margins": 0.058773577213287354, "rewards/rejected": -0.038570769131183624, "step": 7 }, { "debug/policy_chosen_logits": -3.154578447341919, "debug/policy_chosen_logps": -211.52667236328125, "debug/policy_rejected_logits": -3.1434481143951416, "debug/policy_rejected_logps": -185.707763671875, "debug/reference_chosen_logps": -213.74742126464844, "debug/reference_rejected_logps": -184.40182495117188, "epoch": 0.1568627450980392, "grad_norm": 6.335784838530231, "learning_rate": 1e-06, "logits/chosen": -3.154578447341919, "logits/rejected": -3.1434481143951416, "logps/chosen": -211.52667236328125, "logps/rejected": -185.707763671875, "loss": 0.4552, "rewards/accuracies": 0.75, "rewards/chosen": 0.022207507863640785, "rewards/margins": 0.03526674211025238, "rewards/rejected": -0.013059234246611595, "step": 8 }, { "debug/policy_chosen_logits": -3.2226743698120117, "debug/policy_chosen_logps": -202.90101623535156, "debug/policy_rejected_logits": -3.0353806018829346, "debug/policy_rejected_logps": -164.21823120117188, "debug/reference_chosen_logps": -207.07034301757812, "debug/reference_rejected_logps": -154.7605438232422, "epoch": 0.17647058823529413, "grad_norm": 6.262440890126476, "learning_rate": 1e-06, "logits/chosen": -3.2226743698120117, "logits/rejected": -3.0353806018829346, "logps/chosen": -202.90101623535156, "logps/rejected": -164.21823120117188, "loss": 0.4059, "rewards/accuracies": 1.0, "rewards/chosen": 0.04169316962361336, "rewards/margins": 0.13627006113529205, "rewards/rejected": -0.0945768877863884, "step": 9 }, { "debug/policy_chosen_logits": -3.375203847885132, "debug/policy_chosen_logps": -182.47267150878906, "debug/policy_rejected_logits": -2.9658010005950928, "debug/policy_rejected_logps": -173.54071044921875, "debug/reference_chosen_logps": -190.21826171875, "debug/reference_rejected_logps": -162.2075958251953, "epoch": 0.19607843137254902, "grad_norm": 6.1849049071096225, "learning_rate": 1e-06, "logits/chosen": -3.375203847885132, "logits/rejected": -2.9658010005950928, "logps/chosen": -182.47267150878906, "logps/rejected": -173.54071044921875, "loss": 0.3573, "rewards/accuracies": 1.0, "rewards/chosen": 0.07745584845542908, "rewards/margins": 0.19078706204891205, "rewards/rejected": -0.11333122849464417, "step": 10 }, { "debug/policy_chosen_logits": -3.1753156185150146, "debug/policy_chosen_logps": -203.0382080078125, "debug/policy_rejected_logits": -2.948859691619873, "debug/policy_rejected_logps": -178.07508850097656, "debug/reference_chosen_logps": -212.055908203125, "debug/reference_rejected_logps": -169.71719360351562, "epoch": 0.21568627450980393, "grad_norm": 5.773073574456393, "learning_rate": 1e-06, "logits/chosen": -3.1753156185150146, "logits/rejected": -2.948859691619873, "logps/chosen": -203.0382080078125, "logps/rejected": -178.07508850097656, "loss": 0.3633, "rewards/accuracies": 0.875, "rewards/chosen": 0.09017696976661682, "rewards/margins": 0.17375589907169342, "rewards/rejected": -0.0835789293050766, "step": 11 }, { "debug/policy_chosen_logits": -3.276843309402466, "debug/policy_chosen_logps": -199.274169921875, "debug/policy_rejected_logits": -3.1320600509643555, "debug/policy_rejected_logps": -180.1590576171875, "debug/reference_chosen_logps": -208.1540985107422, "debug/reference_rejected_logps": -177.0548095703125, "epoch": 0.23529411764705882, "grad_norm": 5.87667348979612, "learning_rate": 1e-06, "logits/chosen": -3.276843309402466, "logits/rejected": -3.1320600509643555, "logps/chosen": -199.274169921875, "logps/rejected": -180.1590576171875, "loss": 0.3496, "rewards/accuracies": 0.875, "rewards/chosen": 0.08879929780960083, "rewards/margins": 0.11984152346849442, "rewards/rejected": -0.03104221448302269, "step": 12 }, { "debug/policy_chosen_logits": -3.1219661235809326, "debug/policy_chosen_logps": -191.55943298339844, "debug/policy_rejected_logits": -2.9815196990966797, "debug/policy_rejected_logps": -176.33534240722656, "debug/reference_chosen_logps": -196.898193359375, "debug/reference_rejected_logps": -166.8399658203125, "epoch": 0.2549019607843137, "grad_norm": 5.024914031419329, "learning_rate": 1e-06, "logits/chosen": -3.1219661235809326, "logits/rejected": -2.9815196990966797, "logps/chosen": -191.55943298339844, "logps/rejected": -176.33534240722656, "loss": 0.37, "rewards/accuracies": 0.875, "rewards/chosen": 0.053387563675642014, "rewards/margins": 0.14834120869636536, "rewards/rejected": -0.09495364874601364, "step": 13 }, { "debug/policy_chosen_logits": -3.144469738006592, "debug/policy_chosen_logps": -218.86590576171875, "debug/policy_rejected_logits": -2.890340566635132, "debug/policy_rejected_logps": -181.0338134765625, "debug/reference_chosen_logps": -229.2056884765625, "debug/reference_rejected_logps": -170.25051879882812, "epoch": 0.27450980392156865, "grad_norm": 5.156857153303333, "learning_rate": 1e-06, "logits/chosen": -3.144469738006592, "logits/rejected": -2.890340566635132, "logps/chosen": -218.86590576171875, "logps/rejected": -181.0338134765625, "loss": 0.3451, "rewards/accuracies": 1.0, "rewards/chosen": 0.10339776426553726, "rewards/margins": 0.21123060584068298, "rewards/rejected": -0.10783283412456512, "step": 14 }, { "debug/policy_chosen_logits": -3.2239673137664795, "debug/policy_chosen_logps": -201.61410522460938, "debug/policy_rejected_logits": -2.989596128463745, "debug/policy_rejected_logps": -184.40621948242188, "debug/reference_chosen_logps": -210.53994750976562, "debug/reference_rejected_logps": -177.02459716796875, "epoch": 0.29411764705882354, "grad_norm": 5.282418768529003, "learning_rate": 1e-06, "logits/chosen": -3.2239673137664795, "logits/rejected": -2.989596128463745, "logps/chosen": -201.61410522460938, "logps/rejected": -184.40621948242188, "loss": 0.3296, "rewards/accuracies": 0.75, "rewards/chosen": 0.08925840258598328, "rewards/margins": 0.16307473182678223, "rewards/rejected": -0.07381631433963776, "step": 15 }, { "debug/policy_chosen_logits": -3.151991128921509, "debug/policy_chosen_logps": -203.8999481201172, "debug/policy_rejected_logits": -3.007563591003418, "debug/policy_rejected_logps": -174.22891235351562, "debug/reference_chosen_logps": -212.34693908691406, "debug/reference_rejected_logps": -161.72183227539062, "epoch": 0.3137254901960784, "grad_norm": 4.989706271320969, "learning_rate": 1e-06, "logits/chosen": -3.151991128921509, "logits/rejected": -3.007563591003418, "logps/chosen": -203.8999481201172, "logps/rejected": -174.22891235351562, "loss": 0.3299, "rewards/accuracies": 1.0, "rewards/chosen": 0.08446990698575974, "rewards/margins": 0.2095407247543335, "rewards/rejected": -0.12507081031799316, "step": 16 }, { "debug/policy_chosen_logits": -3.325493812561035, "debug/policy_chosen_logps": -196.61074829101562, "debug/policy_rejected_logits": -3.114488124847412, "debug/policy_rejected_logps": -185.79983520507812, "debug/reference_chosen_logps": -209.38027954101562, "debug/reference_rejected_logps": -171.8429412841797, "epoch": 0.3333333333333333, "grad_norm": 3.775137601116182, "learning_rate": 1e-06, "logits/chosen": -3.325493812561035, "logits/rejected": -3.114488124847412, "logps/chosen": -196.61074829101562, "logps/rejected": -185.79983520507812, "loss": 0.3239, "rewards/accuracies": 0.625, "rewards/chosen": 0.127695232629776, "rewards/margins": 0.2672642469406128, "rewards/rejected": -0.13956904411315918, "step": 17 }, { "debug/policy_chosen_logits": -3.4259612560272217, "debug/policy_chosen_logps": -196.9546356201172, "debug/policy_rejected_logits": -3.054805278778076, "debug/policy_rejected_logps": -193.873046875, "debug/reference_chosen_logps": -214.143310546875, "debug/reference_rejected_logps": -176.38931274414062, "epoch": 0.35294117647058826, "grad_norm": 4.462985781931295, "learning_rate": 1e-06, "logits/chosen": -3.4259612560272217, "logits/rejected": -3.054805278778076, "logps/chosen": -196.9546356201172, "logps/rejected": -193.873046875, "loss": 0.2735, "rewards/accuracies": 0.875, "rewards/chosen": 0.17188671231269836, "rewards/margins": 0.3467240333557129, "rewards/rejected": -0.17483732104301453, "step": 18 }, { "debug/policy_chosen_logits": -3.160147190093994, "debug/policy_chosen_logps": -204.79298400878906, "debug/policy_rejected_logits": -2.9090147018432617, "debug/policy_rejected_logps": -190.56497192382812, "debug/reference_chosen_logps": -227.2172088623047, "debug/reference_rejected_logps": -159.18490600585938, "epoch": 0.37254901960784315, "grad_norm": 4.343866551481763, "learning_rate": 1e-06, "logits/chosen": -3.160147190093994, "logits/rejected": -2.9090147018432617, "logps/chosen": -204.79298400878906, "logps/rejected": -190.56497192382812, "loss": 0.203, "rewards/accuracies": 0.875, "rewards/chosen": 0.22424226999282837, "rewards/margins": 0.5380429625511169, "rewards/rejected": -0.3138006925582886, "step": 19 }, { "debug/policy_chosen_logits": -3.393888235092163, "debug/policy_chosen_logps": -183.07261657714844, "debug/policy_rejected_logits": -2.96246337890625, "debug/policy_rejected_logps": -193.21229553222656, "debug/reference_chosen_logps": -208.93617248535156, "debug/reference_rejected_logps": -170.37913513183594, "epoch": 0.39215686274509803, "grad_norm": 6.6782746545956355, "learning_rate": 1e-06, "logits/chosen": -3.393888235092163, "logits/rejected": -2.96246337890625, "logps/chosen": -183.07261657714844, "logps/rejected": -193.21229553222656, "loss": 0.2687, "rewards/accuracies": 0.75, "rewards/chosen": 0.2586354911327362, "rewards/margins": 0.48696693778038025, "rewards/rejected": -0.22833144664764404, "step": 20 }, { "debug/policy_chosen_logits": -3.3085851669311523, "debug/policy_chosen_logps": -171.0208282470703, "debug/policy_rejected_logits": -2.9337971210479736, "debug/policy_rejected_logps": -190.86468505859375, "debug/reference_chosen_logps": -194.40255737304688, "debug/reference_rejected_logps": -161.28668212890625, "epoch": 0.4117647058823529, "grad_norm": 5.46037647182631, "learning_rate": 1e-06, "logits/chosen": -3.3085851669311523, "logits/rejected": -2.9337971210479736, "logps/chosen": -171.0208282470703, "logps/rejected": -190.86468505859375, "loss": 0.2322, "rewards/accuracies": 0.875, "rewards/chosen": 0.23381738364696503, "rewards/margins": 0.529597282409668, "rewards/rejected": -0.29577991366386414, "step": 21 }, { "debug/policy_chosen_logits": -3.2622666358947754, "debug/policy_chosen_logps": -175.09979248046875, "debug/policy_rejected_logits": -2.974461555480957, "debug/policy_rejected_logps": -195.9930877685547, "debug/reference_chosen_logps": -195.777587890625, "debug/reference_rejected_logps": -166.50228881835938, "epoch": 0.43137254901960786, "grad_norm": 4.01463505036889, "learning_rate": 1e-06, "logits/chosen": -3.2622666358947754, "logits/rejected": -2.974461555480957, "logps/chosen": -175.09979248046875, "logps/rejected": -195.9930877685547, "loss": 0.1799, "rewards/accuracies": 0.75, "rewards/chosen": 0.2067781686782837, "rewards/margins": 0.5016859769821167, "rewards/rejected": -0.2949078381061554, "step": 22 }, { "debug/policy_chosen_logits": -3.2856056690216064, "debug/policy_chosen_logps": -201.78982543945312, "debug/policy_rejected_logits": -3.0591306686401367, "debug/policy_rejected_logps": -190.1474609375, "debug/reference_chosen_logps": -227.23031616210938, "debug/reference_rejected_logps": -170.58262634277344, "epoch": 0.45098039215686275, "grad_norm": 3.041748343544453, "learning_rate": 1e-06, "logits/chosen": -3.2856056690216064, "logits/rejected": -3.0591306686401367, "logps/chosen": -201.78982543945312, "logps/rejected": -190.1474609375, "loss": 0.2457, "rewards/accuracies": 0.75, "rewards/chosen": 0.254404753446579, "rewards/margins": 0.4500531554222107, "rewards/rejected": -0.1956484317779541, "step": 23 }, { "debug/policy_chosen_logits": -3.3577394485473633, "debug/policy_chosen_logps": -181.08404541015625, "debug/policy_rejected_logits": -2.9297733306884766, "debug/policy_rejected_logps": -198.900634765625, "debug/reference_chosen_logps": -208.0858917236328, "debug/reference_rejected_logps": -156.55642700195312, "epoch": 0.47058823529411764, "grad_norm": 4.54525191842216, "learning_rate": 1e-06, "logits/chosen": -3.3577394485473633, "logits/rejected": -2.9297733306884766, "logps/chosen": -181.08404541015625, "logps/rejected": -198.900634765625, "loss": 0.1901, "rewards/accuracies": 0.875, "rewards/chosen": 0.2700183689594269, "rewards/margins": 0.6934603452682495, "rewards/rejected": -0.42344197630882263, "step": 24 }, { "debug/policy_chosen_logits": -3.2948226928710938, "debug/policy_chosen_logps": -200.36212158203125, "debug/policy_rejected_logits": -2.963944435119629, "debug/policy_rejected_logps": -209.5209503173828, "debug/reference_chosen_logps": -226.37498474121094, "debug/reference_rejected_logps": -175.77857971191406, "epoch": 0.49019607843137253, "grad_norm": 4.930573763605194, "learning_rate": 1e-06, "logits/chosen": -3.2948226928710938, "logits/rejected": -2.963944435119629, "logps/chosen": -200.36212158203125, "logps/rejected": -209.5209503173828, "loss": 0.2059, "rewards/accuracies": 0.875, "rewards/chosen": 0.2601286768913269, "rewards/margins": 0.5975522994995117, "rewards/rejected": -0.3374236226081848, "step": 25 }, { "debug/policy_chosen_logits": -3.302710771560669, "debug/policy_chosen_logps": -171.43429565429688, "debug/policy_rejected_logits": -3.040092945098877, "debug/policy_rejected_logps": -191.50186157226562, "debug/reference_chosen_logps": -197.53707885742188, "debug/reference_rejected_logps": -173.1179656982422, "epoch": 0.5098039215686274, "grad_norm": 3.000694158056279, "learning_rate": 1e-06, "logits/chosen": -3.302710771560669, "logits/rejected": -3.040092945098877, "logps/chosen": -171.43429565429688, "logps/rejected": -191.50186157226562, "loss": 0.2073, "rewards/accuracies": 0.75, "rewards/chosen": 0.26102781295776367, "rewards/margins": 0.4448668956756592, "rewards/rejected": -0.1838391125202179, "step": 26 }, { "debug/policy_chosen_logits": -3.2088711261749268, "debug/policy_chosen_logps": -184.981689453125, "debug/policy_rejected_logits": -3.0571742057800293, "debug/policy_rejected_logps": -180.77044677734375, "debug/reference_chosen_logps": -216.10214233398438, "debug/reference_rejected_logps": -166.07806396484375, "epoch": 0.5294117647058824, "grad_norm": 2.949818169869179, "learning_rate": 1e-06, "logits/chosen": -3.2088711261749268, "logits/rejected": -3.0571742057800293, "logps/chosen": -184.981689453125, "logps/rejected": -180.77044677734375, "loss": 0.1885, "rewards/accuracies": 0.75, "rewards/chosen": 0.3112045228481293, "rewards/margins": 0.4581283926963806, "rewards/rejected": -0.14692384004592896, "step": 27 }, { "debug/policy_chosen_logits": -3.183767557144165, "debug/policy_chosen_logps": -189.71145629882812, "debug/policy_rejected_logits": -3.0254147052764893, "debug/policy_rejected_logps": -192.36737060546875, "debug/reference_chosen_logps": -225.47145080566406, "debug/reference_rejected_logps": -174.45132446289062, "epoch": 0.5490196078431373, "grad_norm": 5.145448324683731, "learning_rate": 1e-06, "logits/chosen": -3.183767557144165, "logits/rejected": -3.0254147052764893, "logps/chosen": -189.71145629882812, "logps/rejected": -192.36737060546875, "loss": 0.22, "rewards/accuracies": 0.75, "rewards/chosen": 0.3575999140739441, "rewards/margins": 0.5367603898048401, "rewards/rejected": -0.17916050553321838, "step": 28 }, { "debug/policy_chosen_logits": -3.4387083053588867, "debug/policy_chosen_logps": -162.32211303710938, "debug/policy_rejected_logits": -2.9201865196228027, "debug/policy_rejected_logps": -201.7137451171875, "debug/reference_chosen_logps": -197.59341430664062, "debug/reference_rejected_logps": -162.81692504882812, "epoch": 0.5686274509803921, "grad_norm": 2.2015503343344056, "learning_rate": 1e-06, "logits/chosen": -3.4387083053588867, "logits/rejected": -2.9201865196228027, "logps/chosen": -162.32211303710938, "logps/rejected": -201.7137451171875, "loss": 0.1582, "rewards/accuracies": 0.875, "rewards/chosen": 0.352713018655777, "rewards/margins": 0.7416810989379883, "rewards/rejected": -0.3889680504798889, "step": 29 }, { "debug/policy_chosen_logits": -3.392920732498169, "debug/policy_chosen_logps": -168.54994201660156, "debug/policy_rejected_logits": -3.1080636978149414, "debug/policy_rejected_logps": -187.03036499023438, "debug/reference_chosen_logps": -202.75399780273438, "debug/reference_rejected_logps": -165.93167114257812, "epoch": 0.5882352941176471, "grad_norm": 4.644591680704765, "learning_rate": 1e-06, "logits/chosen": -3.392920732498169, "logits/rejected": -3.1080636978149414, "logps/chosen": -168.54994201660156, "logps/rejected": -187.03036499023438, "loss": 0.2242, "rewards/accuracies": 0.75, "rewards/chosen": 0.3420405983924866, "rewards/margins": 0.5530275106430054, "rewards/rejected": -0.21098692715168, "step": 30 }, { "debug/policy_chosen_logits": -3.295290231704712, "debug/policy_chosen_logps": -187.04917907714844, "debug/policy_rejected_logits": -3.0415842533111572, "debug/policy_rejected_logps": -190.411865234375, "debug/reference_chosen_logps": -225.02894592285156, "debug/reference_rejected_logps": -176.0540771484375, "epoch": 0.6078431372549019, "grad_norm": 5.397720739472804, "learning_rate": 1e-06, "logits/chosen": -3.295290231704712, "logits/rejected": -3.0415842533111572, "logps/chosen": -187.04917907714844, "logps/rejected": -190.411865234375, "loss": 0.2502, "rewards/accuracies": 1.0, "rewards/chosen": 0.3797976076602936, "rewards/margins": 0.523375391960144, "rewards/rejected": -0.14357778429985046, "step": 31 }, { "debug/policy_chosen_logits": -3.2399747371673584, "debug/policy_chosen_logps": -183.7709197998047, "debug/policy_rejected_logits": -3.0016558170318604, "debug/policy_rejected_logps": -206.39227294921875, "debug/reference_chosen_logps": -215.31048583984375, "debug/reference_rejected_logps": -162.4297637939453, "epoch": 0.6274509803921569, "grad_norm": 2.9961110652965304, "learning_rate": 1e-06, "logits/chosen": -3.2399747371673584, "logits/rejected": -3.0016558170318604, "logps/chosen": -183.7709197998047, "logps/rejected": -206.39227294921875, "loss": 0.1748, "rewards/accuracies": 1.0, "rewards/chosen": 0.3153955340385437, "rewards/margins": 0.7550206780433655, "rewards/rejected": -0.4396251440048218, "step": 32 }, { "debug/policy_chosen_logits": -3.337228536605835, "debug/policy_chosen_logps": -181.0286865234375, "debug/policy_rejected_logits": -3.016998529434204, "debug/policy_rejected_logps": -207.33807373046875, "debug/reference_chosen_logps": -214.73683166503906, "debug/reference_rejected_logps": -155.3048095703125, "epoch": 0.6470588235294118, "grad_norm": 3.3411699830241126, "learning_rate": 1e-06, "logits/chosen": -3.337228536605835, "logits/rejected": -3.016998529434204, "logps/chosen": -181.0286865234375, "logps/rejected": -207.33807373046875, "loss": 0.2193, "rewards/accuracies": 1.0, "rewards/chosen": 0.3370813727378845, "rewards/margins": 0.8574139475822449, "rewards/rejected": -0.5203325748443604, "step": 33 }, { "debug/policy_chosen_logits": -3.252537488937378, "debug/policy_chosen_logps": -177.55934143066406, "debug/policy_rejected_logits": -2.9824187755584717, "debug/policy_rejected_logps": -201.26881408691406, "debug/reference_chosen_logps": -204.36630249023438, "debug/reference_rejected_logps": -173.47714233398438, "epoch": 0.6666666666666666, "grad_norm": 4.181592807951308, "learning_rate": 1e-06, "logits/chosen": -3.252537488937378, "logits/rejected": -2.9824187755584717, "logps/chosen": -177.55934143066406, "logps/rejected": -201.26881408691406, "loss": 0.1932, "rewards/accuracies": 0.875, "rewards/chosen": 0.26806968450546265, "rewards/margins": 0.545986533164978, "rewards/rejected": -0.2779168486595154, "step": 34 }, { "debug/policy_chosen_logits": -3.2519893646240234, "debug/policy_chosen_logps": -189.9286346435547, "debug/policy_rejected_logits": -3.073943614959717, "debug/policy_rejected_logps": -193.5225067138672, "debug/reference_chosen_logps": -217.17962646484375, "debug/reference_rejected_logps": -160.67848205566406, "epoch": 0.6862745098039216, "grad_norm": 4.389149302503785, "learning_rate": 1e-06, "logits/chosen": -3.2519893646240234, "logits/rejected": -3.073943614959717, "logps/chosen": -189.9286346435547, "logps/rejected": -193.5225067138672, "loss": 0.1489, "rewards/accuracies": 0.75, "rewards/chosen": 0.272509902715683, "rewards/margins": 0.6009501218795776, "rewards/rejected": -0.32844021916389465, "step": 35 }, { "debug/policy_chosen_logits": -3.3464720249176025, "debug/policy_chosen_logps": -179.3638916015625, "debug/policy_rejected_logits": -3.137779951095581, "debug/policy_rejected_logps": -190.06446838378906, "debug/reference_chosen_logps": -211.63006591796875, "debug/reference_rejected_logps": -169.59912109375, "epoch": 0.7058823529411765, "grad_norm": 3.417151698316011, "learning_rate": 1e-06, "logits/chosen": -3.3464720249176025, "logits/rejected": -3.137779951095581, "logps/chosen": -179.3638916015625, "logps/rejected": -190.06446838378906, "loss": 0.2432, "rewards/accuracies": 0.75, "rewards/chosen": 0.3226618468761444, "rewards/margins": 0.5273153781890869, "rewards/rejected": -0.2046535164117813, "step": 36 }, { "debug/policy_chosen_logits": -3.222252607345581, "debug/policy_chosen_logps": -172.60418701171875, "debug/policy_rejected_logits": -3.0409443378448486, "debug/policy_rejected_logps": -191.89566040039062, "debug/reference_chosen_logps": -204.07984924316406, "debug/reference_rejected_logps": -162.96307373046875, "epoch": 0.7254901960784313, "grad_norm": 4.133875037463127, "learning_rate": 1e-06, "logits/chosen": -3.222252607345581, "logits/rejected": -3.0409443378448486, "logps/chosen": -172.60418701171875, "logps/rejected": -191.89566040039062, "loss": 0.223, "rewards/accuracies": 1.0, "rewards/chosen": 0.31475669145584106, "rewards/margins": 0.6040827035903931, "rewards/rejected": -0.2893260419368744, "step": 37 }, { "debug/policy_chosen_logits": -3.3690860271453857, "debug/policy_chosen_logps": -179.07205200195312, "debug/policy_rejected_logits": -2.9036881923675537, "debug/policy_rejected_logps": -196.90289306640625, "debug/reference_chosen_logps": -211.8248291015625, "debug/reference_rejected_logps": -159.6433563232422, "epoch": 0.7450980392156863, "grad_norm": 4.100516480274074, "learning_rate": 1e-06, "logits/chosen": -3.3690860271453857, "logits/rejected": -2.9036881923675537, "logps/chosen": -179.07205200195312, "logps/rejected": -196.90289306640625, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": 0.3275277614593506, "rewards/margins": 0.7001230716705322, "rewards/rejected": -0.37259525060653687, "step": 38 }, { "debug/policy_chosen_logits": -3.253589153289795, "debug/policy_chosen_logps": -187.75148010253906, "debug/policy_rejected_logits": -3.052138328552246, "debug/policy_rejected_logps": -185.89328002929688, "debug/reference_chosen_logps": -218.99688720703125, "debug/reference_rejected_logps": -167.86203002929688, "epoch": 0.7647058823529411, "grad_norm": 3.8551603355976303, "learning_rate": 1e-06, "logits/chosen": -3.253589153289795, "logits/rejected": -3.052138328552246, "logps/chosen": -187.75148010253906, "logps/rejected": -185.89328002929688, "loss": 0.2108, "rewards/accuracies": 0.875, "rewards/chosen": 0.3124539256095886, "rewards/margins": 0.4927663207054138, "rewards/rejected": -0.18031242489814758, "step": 39 }, { "debug/policy_chosen_logits": -3.397043228149414, "debug/policy_chosen_logps": -184.57046508789062, "debug/policy_rejected_logits": -2.9531567096710205, "debug/policy_rejected_logps": -205.21624755859375, "debug/reference_chosen_logps": -216.28897094726562, "debug/reference_rejected_logps": -167.5351104736328, "epoch": 0.7843137254901961, "grad_norm": 4.209706885927611, "learning_rate": 1e-06, "logits/chosen": -3.397043228149414, "logits/rejected": -2.9531567096710205, "logps/chosen": -184.57046508789062, "logps/rejected": -205.21624755859375, "loss": 0.1481, "rewards/accuracies": 0.875, "rewards/chosen": 0.3171852231025696, "rewards/margins": 0.6939965486526489, "rewards/rejected": -0.3768113851547241, "step": 40 }, { "debug/policy_chosen_logits": -3.467691421508789, "debug/policy_chosen_logps": -174.2329864501953, "debug/policy_rejected_logits": -3.1792116165161133, "debug/policy_rejected_logps": -200.54534912109375, "debug/reference_chosen_logps": -201.8401641845703, "debug/reference_rejected_logps": -168.33837890625, "epoch": 0.803921568627451, "grad_norm": 4.763633998186518, "learning_rate": 1e-06, "logits/chosen": -3.467691421508789, "logits/rejected": -3.1792116165161133, "logps/chosen": -174.2329864501953, "logps/rejected": -200.54534912109375, "loss": 0.1903, "rewards/accuracies": 1.0, "rewards/chosen": 0.27607178688049316, "rewards/margins": 0.5981414318084717, "rewards/rejected": -0.3220696747303009, "step": 41 }, { "debug/policy_chosen_logits": -3.3941807746887207, "debug/policy_chosen_logps": -173.32530212402344, "debug/policy_rejected_logits": -3.044524908065796, "debug/policy_rejected_logps": -195.6669921875, "debug/reference_chosen_logps": -198.82882690429688, "debug/reference_rejected_logps": -170.7233123779297, "epoch": 0.8235294117647058, "grad_norm": 5.656950290864125, "learning_rate": 1e-06, "logits/chosen": -3.3941807746887207, "logits/rejected": -3.044524908065796, "logps/chosen": -173.32530212402344, "logps/rejected": -195.6669921875, "loss": 0.1758, "rewards/accuracies": 1.0, "rewards/chosen": 0.25503525137901306, "rewards/margins": 0.5044721364974976, "rewards/rejected": -0.2494368553161621, "step": 42 }, { "debug/policy_chosen_logits": -3.3669402599334717, "debug/policy_chosen_logps": -174.84701538085938, "debug/policy_rejected_logits": -3.1404428482055664, "debug/policy_rejected_logps": -201.5968017578125, "debug/reference_chosen_logps": -205.5618438720703, "debug/reference_rejected_logps": -192.6947021484375, "epoch": 0.8431372549019608, "grad_norm": 3.6274998713619695, "learning_rate": 1e-06, "logits/chosen": -3.3669402599334717, "logits/rejected": -3.1404428482055664, "logps/chosen": -174.84701538085938, "logps/rejected": -201.5968017578125, "loss": 0.1594, "rewards/accuracies": 0.875, "rewards/chosen": 0.307148277759552, "rewards/margins": 0.3961692452430725, "rewards/rejected": -0.08902095258235931, "step": 43 }, { "debug/policy_chosen_logits": -3.4695048332214355, "debug/policy_chosen_logps": -172.99578857421875, "debug/policy_rejected_logits": -3.082566261291504, "debug/policy_rejected_logps": -192.80035400390625, "debug/reference_chosen_logps": -203.4981231689453, "debug/reference_rejected_logps": -165.00967407226562, "epoch": 0.8627450980392157, "grad_norm": 5.394067723272868, "learning_rate": 1e-06, "logits/chosen": -3.4695048332214355, "logits/rejected": -3.082566261291504, "logps/chosen": -172.99578857421875, "logps/rejected": -192.80035400390625, "loss": 0.2547, "rewards/accuracies": 0.875, "rewards/chosen": 0.30502331256866455, "rewards/margins": 0.5829300284385681, "rewards/rejected": -0.27790671586990356, "step": 44 }, { "debug/policy_chosen_logits": -3.3292031288146973, "debug/policy_chosen_logps": -170.69662475585938, "debug/policy_rejected_logits": -2.94071364402771, "debug/policy_rejected_logps": -193.63653564453125, "debug/reference_chosen_logps": -202.19802856445312, "debug/reference_rejected_logps": -159.46893310546875, "epoch": 0.8823529411764706, "grad_norm": 5.507456408536459, "learning_rate": 1e-06, "logits/chosen": -3.3292031288146973, "logits/rejected": -2.94071364402771, "logps/chosen": -170.69662475585938, "logps/rejected": -193.63653564453125, "loss": 0.2139, "rewards/accuracies": 0.875, "rewards/chosen": 0.31501394510269165, "rewards/margins": 0.6566898822784424, "rewards/rejected": -0.34167587757110596, "step": 45 }, { "debug/policy_chosen_logits": -3.226253032684326, "debug/policy_chosen_logps": -188.11349487304688, "debug/policy_rejected_logits": -2.985891819000244, "debug/policy_rejected_logps": -193.01876831054688, "debug/reference_chosen_logps": -220.32342529296875, "debug/reference_rejected_logps": -167.2851104736328, "epoch": 0.9019607843137255, "grad_norm": 2.6575090459968886, "learning_rate": 1e-06, "logits/chosen": -3.226253032684326, "logits/rejected": -2.985891819000244, "logps/chosen": -188.11349487304688, "logps/rejected": -193.01876831054688, "loss": 0.1597, "rewards/accuracies": 0.875, "rewards/chosen": 0.32209956645965576, "rewards/margins": 0.5794363021850586, "rewards/rejected": -0.25733667612075806, "step": 46 }, { "debug/policy_chosen_logits": -3.584475517272949, "debug/policy_chosen_logps": -182.8424072265625, "debug/policy_rejected_logits": -3.0511176586151123, "debug/policy_rejected_logps": -198.2120361328125, "debug/reference_chosen_logps": -213.4707794189453, "debug/reference_rejected_logps": -172.90322875976562, "epoch": 0.9215686274509803, "grad_norm": 2.609406640436062, "learning_rate": 1e-06, "logits/chosen": -3.584475517272949, "logits/rejected": -3.0511176586151123, "logps/chosen": -182.8424072265625, "logps/rejected": -198.2120361328125, "loss": 0.1684, "rewards/accuracies": 1.0, "rewards/chosen": 0.3062836825847626, "rewards/margins": 0.5593717098236084, "rewards/rejected": -0.2530880570411682, "step": 47 }, { "debug/policy_chosen_logits": -3.3837730884552, "debug/policy_chosen_logps": -175.48831176757812, "debug/policy_rejected_logits": -3.044293165206909, "debug/policy_rejected_logps": -202.69216918945312, "debug/reference_chosen_logps": -203.88320922851562, "debug/reference_rejected_logps": -172.34735107421875, "epoch": 0.9411764705882353, "grad_norm": 2.2964317049630756, "learning_rate": 1e-06, "logits/chosen": -3.3837730884552, "logits/rejected": -3.044293165206909, "logps/chosen": -175.48831176757812, "logps/rejected": -202.69216918945312, "loss": 0.1661, "rewards/accuracies": 1.0, "rewards/chosen": 0.2839488983154297, "rewards/margins": 0.5873969793319702, "rewards/rejected": -0.30344805121421814, "step": 48 }, { "debug/policy_chosen_logits": -3.3742902278900146, "debug/policy_chosen_logps": -185.5418243408203, "debug/policy_rejected_logits": -3.0798239707946777, "debug/policy_rejected_logps": -202.81005859375, "debug/reference_chosen_logps": -221.17340087890625, "debug/reference_rejected_logps": -154.19923400878906, "epoch": 0.9607843137254902, "grad_norm": 2.620292236564946, "learning_rate": 1e-06, "logits/chosen": -3.3742902278900146, "logits/rejected": -3.0798239707946777, "logps/chosen": -185.5418243408203, "logps/rejected": -202.81005859375, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 0.3563159108161926, "rewards/margins": 0.8424241542816162, "rewards/rejected": -0.4861082136631012, "step": 49 }, { "debug/policy_chosen_logits": -3.4754183292388916, "debug/policy_chosen_logps": -177.518310546875, "debug/policy_rejected_logits": -3.0591540336608887, "debug/policy_rejected_logps": -195.78494262695312, "debug/reference_chosen_logps": -203.58407592773438, "debug/reference_rejected_logps": -161.39697265625, "epoch": 0.9803921568627451, "grad_norm": 3.1624182697538172, "learning_rate": 1e-06, "logits/chosen": -3.4754183292388916, "logits/rejected": -3.0591540336608887, "logps/chosen": -177.518310546875, "logps/rejected": -195.78494262695312, "loss": 0.1887, "rewards/accuracies": 0.875, "rewards/chosen": 0.2606576681137085, "rewards/margins": 0.6045372486114502, "rewards/rejected": -0.3438795804977417, "step": 50 }, { "debug/policy_chosen_logits": -3.4807093143463135, "debug/policy_chosen_logps": -177.54910278320312, "debug/policy_rejected_logits": -3.035158395767212, "debug/policy_rejected_logps": -203.39454650878906, "debug/reference_chosen_logps": -212.01348876953125, "debug/reference_rejected_logps": -166.06936645507812, "epoch": 1.0, "grad_norm": 2.738075468992105, "learning_rate": 1e-06, "logits/chosen": -3.4807093143463135, "logits/rejected": -3.035158395767212, "logps/chosen": -177.54910278320312, "logps/rejected": -203.39454650878906, "loss": 0.1372, "rewards/accuracies": 0.875, "rewards/chosen": 0.34464380145072937, "rewards/margins": 0.7178955078125, "rewards/rejected": -0.373251736164093, "step": 51 }, { "epoch": 1.0, "step": 51, "total_flos": 0.0, "train_loss": 0.2695229123620426, "train_runtime": 165.9785, "train_samples_per_second": 19.412, "train_steps_per_second": 0.307 } ], "logging_steps": 1, "max_steps": 51, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }