{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 51, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -3.131476879119873, "debug/policy_chosen_logps": -223.49798583984375, "debug/policy_rejected_logits": -3.0218234062194824, "debug/policy_rejected_logps": -181.94036865234375, "debug/reference_chosen_logps": -223.49798583984375, "debug/reference_rejected_logps": -181.94036865234375, "epoch": 0.0196078431372549, "grad_norm": 9.59268936350444, "learning_rate": 1e-06, "logits/chosen": -3.131476879119873, "logits/rejected": -3.0218234062194824, "logps/chosen": -223.49798583984375, "logps/rejected": -181.94036865234375, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -3.1443662643432617, "debug/policy_chosen_logps": -209.216552734375, "debug/policy_rejected_logits": -3.076768159866333, "debug/policy_rejected_logps": -170.2884521484375, "debug/reference_chosen_logps": -209.07872009277344, "debug/reference_rejected_logps": -169.68731689453125, "epoch": 0.0392156862745098, "grad_norm": 8.71018223332819, "learning_rate": 1e-06, "logits/chosen": -3.1443662643432617, "logits/rejected": -3.076768159866333, "logps/chosen": -209.216552734375, "logps/rejected": -170.2884521484375, "loss": 0.4974, "rewards/accuracies": 0.875, "rewards/chosen": -0.001378459855914116, "rewards/margins": 0.004632873460650444, "rewards/rejected": -0.00601133331656456, "step": 2 }, { "debug/policy_chosen_logits": -3.135432243347168, "debug/policy_chosen_logps": -203.98123168945312, "debug/policy_rejected_logits": -3.058173179626465, "debug/policy_rejected_logps": -171.8382568359375, "debug/reference_chosen_logps": -206.17086791992188, "debug/reference_rejected_logps": -172.7147216796875, "epoch": 0.058823529411764705, "grad_norm": 11.448652871780954, "learning_rate": 1e-06, "logits/chosen": -3.135432243347168, "logits/rejected": -3.058173179626465, "logps/chosen": -203.98123168945312, "logps/rejected": -171.8382568359375, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 0.02189634181559086, "rewards/margins": 0.013131675310432911, "rewards/rejected": 0.008764667436480522, "step": 3 }, { "debug/policy_chosen_logits": -3.0565543174743652, "debug/policy_chosen_logps": -206.8490447998047, "debug/policy_rejected_logits": -2.9550375938415527, "debug/policy_rejected_logps": -174.601318359375, "debug/reference_chosen_logps": -206.09422302246094, "debug/reference_rejected_logps": -172.06332397460938, "epoch": 0.0784313725490196, "grad_norm": 8.681717474563778, "learning_rate": 1e-06, "logits/chosen": -3.0565543174743652, "logits/rejected": -2.9550375938415527, "logps/chosen": -206.8490447998047, "logps/rejected": -174.601318359375, "loss": 0.4855, "rewards/accuracies": 0.75, "rewards/chosen": -0.007548102643340826, "rewards/margins": 0.017831895500421524, "rewards/rejected": -0.025379998609423637, "step": 4 }, { "debug/policy_chosen_logits": -3.0399742126464844, "debug/policy_chosen_logps": -206.2752227783203, "debug/policy_rejected_logits": -2.9232938289642334, "debug/policy_rejected_logps": -156.70419311523438, "debug/reference_chosen_logps": -205.56788635253906, "debug/reference_rejected_logps": -150.99920654296875, "epoch": 0.09803921568627451, "grad_norm": 8.125347499138313, "learning_rate": 1e-06, "logits/chosen": -3.0399742126464844, "logits/rejected": -2.9232938289642334, "logps/chosen": -206.2752227783203, "logps/rejected": -156.70419311523438, "loss": 0.4648, "rewards/accuracies": 1.0, "rewards/chosen": -0.00707334466278553, "rewards/margins": 0.04997648298740387, "rewards/rejected": -0.05704982578754425, "step": 5 }, { "debug/policy_chosen_logits": -3.2029356956481934, "debug/policy_chosen_logps": -206.69122314453125, "debug/policy_rejected_logits": -3.122507095336914, "debug/policy_rejected_logps": -199.9852752685547, "debug/reference_chosen_logps": -210.3363494873047, "debug/reference_rejected_logps": -200.11654663085938, "epoch": 0.11764705882352941, "grad_norm": 10.6811806285762, "learning_rate": 1e-06, "logits/chosen": -3.2029356956481934, "logits/rejected": -3.122507095336914, "logps/chosen": -206.69122314453125, "logps/rejected": -199.9852752685547, "loss": 0.4523, "rewards/accuracies": 1.0, "rewards/chosen": 0.03645110875368118, "rewards/margins": 0.03513820841908455, "rewards/rejected": 0.0013129040598869324, "step": 6 }, { "debug/policy_chosen_logits": -3.202118396759033, "debug/policy_chosen_logps": -198.13885498046875, "debug/policy_rejected_logits": -2.9787750244140625, "debug/policy_rejected_logps": -169.29666137695312, "debug/reference_chosen_logps": -199.79531860351562, "debug/reference_rejected_logps": -164.98037719726562, "epoch": 0.13725490196078433, "grad_norm": 7.511392268060962, "learning_rate": 1e-06, "logits/chosen": -3.202118396759033, "logits/rejected": -2.9787750244140625, "logps/chosen": -198.13885498046875, "logps/rejected": -169.29666137695312, "loss": 0.4439, "rewards/accuracies": 0.875, "rewards/chosen": 0.01656484603881836, "rewards/margins": 0.05972753465175629, "rewards/rejected": -0.04316268861293793, "step": 7 }, { "debug/policy_chosen_logits": -3.1585052013397217, "debug/policy_chosen_logps": -209.9014892578125, "debug/policy_rejected_logits": -3.147646427154541, "debug/policy_rejected_logps": -184.0531005859375, "debug/reference_chosen_logps": -213.74742126464844, "debug/reference_rejected_logps": -184.40182495117188, "epoch": 0.1568627450980392, "grad_norm": 9.384905399946895, "learning_rate": 1e-06, "logits/chosen": -3.1585052013397217, "logits/rejected": -3.147646427154541, "logps/chosen": -209.9014892578125, "logps/rejected": -184.0531005859375, "loss": 0.4553, "rewards/accuracies": 0.75, "rewards/chosen": 0.03845922276377678, "rewards/margins": 0.03497195988893509, "rewards/rejected": 0.00348726287484169, "step": 8 }, { "debug/policy_chosen_logits": -3.219722032546997, "debug/policy_chosen_logps": -204.11474609375, "debug/policy_rejected_logits": -3.0252864360809326, "debug/policy_rejected_logps": -165.35894775390625, "debug/reference_chosen_logps": -207.07034301757812, "debug/reference_rejected_logps": -154.7605438232422, "epoch": 0.17647058823529413, "grad_norm": 6.128726988165846, "learning_rate": 1e-06, "logits/chosen": -3.219722032546997, "logits/rejected": -3.0252864360809326, "logps/chosen": -204.11474609375, "logps/rejected": -165.35894775390625, "loss": 0.4068, "rewards/accuracies": 1.0, "rewards/chosen": 0.02955583482980728, "rewards/margins": 0.1355399787425995, "rewards/rejected": -0.1059841513633728, "step": 9 }, { "debug/policy_chosen_logits": -3.3743577003479004, "debug/policy_chosen_logps": -182.16360473632812, "debug/policy_rejected_logits": -2.9649839401245117, "debug/policy_rejected_logps": -173.23670959472656, "debug/reference_chosen_logps": -190.21826171875, "debug/reference_rejected_logps": -162.2075958251953, "epoch": 0.19607843137254902, "grad_norm": 6.364236944929035, "learning_rate": 1e-06, "logits/chosen": -3.3743577003479004, "logits/rejected": -2.9649839401245117, "logps/chosen": -182.16360473632812, "logps/rejected": -173.23670959472656, "loss": 0.3573, "rewards/accuracies": 1.0, "rewards/chosen": 0.08054651319980621, "rewards/margins": 0.19083772599697113, "rewards/rejected": -0.11029121279716492, "step": 10 }, { "debug/policy_chosen_logits": -3.173790454864502, "debug/policy_chosen_logps": -203.52264404296875, "debug/policy_rejected_logits": -2.949070692062378, "debug/policy_rejected_logps": -178.64117431640625, "debug/reference_chosen_logps": -212.055908203125, "debug/reference_rejected_logps": -169.71719360351562, "epoch": 0.21568627450980393, "grad_norm": 5.353556279413172, "learning_rate": 1e-06, "logits/chosen": -3.173790454864502, "logits/rejected": -2.949070692062378, "logps/chosen": -203.52264404296875, "logps/rejected": -178.64117431640625, "loss": 0.3634, "rewards/accuracies": 0.875, "rewards/chosen": 0.08533257991075516, "rewards/margins": 0.1745723932981491, "rewards/rejected": -0.08923980593681335, "step": 11 }, { "debug/policy_chosen_logits": -3.277130126953125, "debug/policy_chosen_logps": -199.06358337402344, "debug/policy_rejected_logits": -3.132725954055786, "debug/policy_rejected_logps": -179.83506774902344, "debug/reference_chosen_logps": -208.1540985107422, "debug/reference_rejected_logps": -177.0548095703125, "epoch": 0.23529411764705882, "grad_norm": 5.672030188679155, "learning_rate": 1e-06, "logits/chosen": -3.277130126953125, "logits/rejected": -3.132725954055786, "logps/chosen": -199.06358337402344, "logps/rejected": -179.83506774902344, "loss": 0.3502, "rewards/accuracies": 0.875, "rewards/chosen": 0.09090512990951538, "rewards/margins": 0.11870768666267395, "rewards/rejected": -0.027802541851997375, "step": 12 }, { "debug/policy_chosen_logits": -3.1242167949676514, "debug/policy_chosen_logps": -191.25042724609375, "debug/policy_rejected_logits": -2.9863691329956055, "debug/policy_rejected_logps": -175.88369750976562, "debug/reference_chosen_logps": -196.898193359375, "debug/reference_rejected_logps": -166.8399658203125, "epoch": 0.2549019607843137, "grad_norm": 5.205320381513838, "learning_rate": 1e-06, "logits/chosen": -3.1242167949676514, "logits/rejected": -2.9863691329956055, "logps/chosen": -191.25042724609375, "logps/rejected": -175.88369750976562, "loss": 0.37, "rewards/accuracies": 0.75, "rewards/chosen": 0.056477658450603485, "rewards/margins": 0.14691495895385742, "rewards/rejected": -0.09043729305267334, "step": 13 }, { "debug/policy_chosen_logits": -3.144406318664551, "debug/policy_chosen_logps": -218.8365478515625, "debug/policy_rejected_logits": -2.8921687602996826, "debug/policy_rejected_logps": -180.90426635742188, "debug/reference_chosen_logps": -229.2056884765625, "debug/reference_rejected_logps": -170.25051879882812, "epoch": 0.27450980392156865, "grad_norm": 5.1865412600469725, "learning_rate": 1e-06, "logits/chosen": -3.144406318664551, "logits/rejected": -2.8921687602996826, "logps/chosen": -218.8365478515625, "logps/rejected": -180.90426635742188, "loss": 0.3456, "rewards/accuracies": 1.0, "rewards/chosen": 0.10369150340557098, "rewards/margins": 0.21022894978523254, "rewards/rejected": -0.10653746128082275, "step": 14 }, { "debug/policy_chosen_logits": -3.2248146533966064, "debug/policy_chosen_logps": -201.5963134765625, "debug/policy_rejected_logits": -2.995159149169922, "debug/policy_rejected_logps": -184.41653442382812, "debug/reference_chosen_logps": -210.53994750976562, "debug/reference_rejected_logps": -177.02459716796875, "epoch": 0.29411764705882354, "grad_norm": 5.281953187102778, "learning_rate": 1e-06, "logits/chosen": -3.2248146533966064, "logits/rejected": -2.995159149169922, "logps/chosen": -201.5963134765625, "logps/rejected": -184.41653442382812, "loss": 0.3298, "rewards/accuracies": 0.75, "rewards/chosen": 0.08943626284599304, "rewards/margins": 0.163355752825737, "rewards/rejected": -0.07391948252916336, "step": 15 }, { "debug/policy_chosen_logits": -3.152334213256836, "debug/policy_chosen_logps": -203.98538208007812, "debug/policy_rejected_logits": -3.0059425830841064, "debug/policy_rejected_logps": -174.17031860351562, "debug/reference_chosen_logps": -212.34693908691406, "debug/reference_rejected_logps": -161.72183227539062, "epoch": 0.3137254901960784, "grad_norm": 4.990441768681764, "learning_rate": 1e-06, "logits/chosen": -3.152334213256836, "logits/rejected": -3.0059425830841064, "logps/chosen": -203.98538208007812, "logps/rejected": -174.17031860351562, "loss": 0.3315, "rewards/accuracies": 1.0, "rewards/chosen": 0.08361560851335526, "rewards/margins": 0.2081003040075302, "rewards/rejected": -0.12448470294475555, "step": 16 }, { "debug/policy_chosen_logits": -3.3237087726593018, "debug/policy_chosen_logps": -197.6831512451172, "debug/policy_rejected_logits": -3.1152756214141846, "debug/policy_rejected_logps": -185.88690185546875, "debug/reference_chosen_logps": -209.38027954101562, "debug/reference_rejected_logps": -171.8429412841797, "epoch": 0.3333333333333333, "grad_norm": 3.847170282499073, "learning_rate": 1e-06, "logits/chosen": -3.3237087726593018, "logits/rejected": -3.1152756214141846, "logps/chosen": -197.6831512451172, "logps/rejected": -185.88690185546875, "loss": 0.3262, "rewards/accuracies": 0.625, "rewards/chosen": 0.11697111278772354, "rewards/margins": 0.25741061568260193, "rewards/rejected": -0.14043951034545898, "step": 17 }, { "debug/policy_chosen_logits": -3.423741102218628, "debug/policy_chosen_logps": -197.50125122070312, "debug/policy_rejected_logits": -3.0541563034057617, "debug/policy_rejected_logps": -194.04254150390625, "debug/reference_chosen_logps": -214.143310546875, "debug/reference_rejected_logps": -176.38931274414062, "epoch": 0.35294117647058826, "grad_norm": 4.6640459530058145, "learning_rate": 1e-06, "logits/chosen": -3.423741102218628, "logits/rejected": -3.0541563034057617, "logps/chosen": -197.50125122070312, "logps/rejected": -194.04254150390625, "loss": 0.2745, "rewards/accuracies": 0.875, "rewards/chosen": 0.16642045974731445, "rewards/margins": 0.3429526686668396, "rewards/rejected": -0.17653217911720276, "step": 18 }, { "debug/policy_chosen_logits": -3.1585283279418945, "debug/policy_chosen_logps": -205.2724151611328, "debug/policy_rejected_logits": -2.9066219329833984, "debug/policy_rejected_logps": -190.3876953125, "debug/reference_chosen_logps": -227.2172088623047, "debug/reference_rejected_logps": -159.18490600585938, "epoch": 0.37254901960784315, "grad_norm": 4.373436851253389, "learning_rate": 1e-06, "logits/chosen": -3.1585283279418945, "logits/rejected": -2.9066219329833984, "logps/chosen": -205.2724151611328, "logps/rejected": -190.3876953125, "loss": 0.2051, "rewards/accuracies": 0.875, "rewards/chosen": 0.2194480001926422, "rewards/margins": 0.5314759016036987, "rewards/rejected": -0.3120279312133789, "step": 19 }, { "debug/policy_chosen_logits": -3.3925397396087646, "debug/policy_chosen_logps": -182.79222106933594, "debug/policy_rejected_logits": -2.9614052772521973, "debug/policy_rejected_logps": -192.5382843017578, "debug/reference_chosen_logps": -208.93617248535156, "debug/reference_rejected_logps": -170.37913513183594, "epoch": 0.39215686274509803, "grad_norm": 7.041418694876974, "learning_rate": 1e-06, "logits/chosen": -3.3925397396087646, "logits/rejected": -2.9614052772521973, "logps/chosen": -182.79222106933594, "logps/rejected": -192.5382843017578, "loss": 0.2688, "rewards/accuracies": 0.75, "rewards/chosen": 0.2614395320415497, "rewards/margins": 0.48303085565567017, "rewards/rejected": -0.22159132361412048, "step": 20 }, { "debug/policy_chosen_logits": -3.3081858158111572, "debug/policy_chosen_logps": -170.76687622070312, "debug/policy_rejected_logits": -2.9337339401245117, "debug/policy_rejected_logps": -190.08706665039062, "debug/reference_chosen_logps": -194.40255737304688, "debug/reference_rejected_logps": -161.28668212890625, "epoch": 0.4117647058823529, "grad_norm": 5.913800772738065, "learning_rate": 1e-06, "logits/chosen": -3.3081858158111572, "logits/rejected": -2.9337339401245117, "logps/chosen": -170.76687622070312, "logps/rejected": -190.08706665039062, "loss": 0.2346, "rewards/accuracies": 0.875, "rewards/chosen": 0.2363569736480713, "rewards/margins": 0.5243606567382812, "rewards/rejected": -0.28800368309020996, "step": 21 }, { "debug/policy_chosen_logits": -3.262423276901245, "debug/policy_chosen_logps": -174.46621704101562, "debug/policy_rejected_logits": -2.9751062393188477, "debug/policy_rejected_logps": -194.79879760742188, "debug/reference_chosen_logps": -195.777587890625, "debug/reference_rejected_logps": -166.50228881835938, "epoch": 0.43137254901960786, "grad_norm": 3.6878540395392996, "learning_rate": 1e-06, "logits/chosen": -3.262423276901245, "logits/rejected": -2.9751062393188477, "logps/chosen": -174.46621704101562, "logps/rejected": -194.79879760742188, "loss": 0.1804, "rewards/accuracies": 0.75, "rewards/chosen": 0.21311378479003906, "rewards/margins": 0.4960786700248718, "rewards/rejected": -0.28296488523483276, "step": 22 }, { "debug/policy_chosen_logits": -3.284288167953491, "debug/policy_chosen_logps": -201.29129028320312, "debug/policy_rejected_logits": -3.055243730545044, "debug/policy_rejected_logps": -189.35101318359375, "debug/reference_chosen_logps": -227.23031616210938, "debug/reference_rejected_logps": -170.58262634277344, "epoch": 0.45098039215686275, "grad_norm": 3.4160494981935057, "learning_rate": 1e-06, "logits/chosen": -3.284288167953491, "logits/rejected": -3.055243730545044, "logps/chosen": -201.29129028320312, "logps/rejected": -189.35101318359375, "loss": 0.2467, "rewards/accuracies": 0.75, "rewards/chosen": 0.2593901753425598, "rewards/margins": 0.44707390666007996, "rewards/rejected": -0.18768377602100372, "step": 23 }, { "debug/policy_chosen_logits": -3.3582632541656494, "debug/policy_chosen_logps": -180.80130004882812, "debug/policy_rejected_logits": -2.9310052394866943, "debug/policy_rejected_logps": -198.58038330078125, "debug/reference_chosen_logps": -208.0858917236328, "debug/reference_rejected_logps": -156.55642700195312, "epoch": 0.47058823529411764, "grad_norm": 4.3584453143571515, "learning_rate": 1e-06, "logits/chosen": -3.3582632541656494, "logits/rejected": -2.9310052394866943, "logps/chosen": -180.80130004882812, "logps/rejected": -198.58038330078125, "loss": 0.1894, "rewards/accuracies": 0.875, "rewards/chosen": 0.2728460729122162, "rewards/margins": 0.6930855512619019, "rewards/rejected": -0.42023950815200806, "step": 24 }, { "debug/policy_chosen_logits": -3.294647455215454, "debug/policy_chosen_logps": -200.66555786132812, "debug/policy_rejected_logits": -2.9623827934265137, "debug/policy_rejected_logps": -209.60763549804688, "debug/reference_chosen_logps": -226.37498474121094, "debug/reference_rejected_logps": -175.77857971191406, "epoch": 0.49019607843137253, "grad_norm": 5.190786042631128, "learning_rate": 1e-06, "logits/chosen": -3.294647455215454, "logits/rejected": -2.9623827934265137, "logps/chosen": -200.66555786132812, "logps/rejected": -209.60763549804688, "loss": 0.2067, "rewards/accuracies": 0.875, "rewards/chosen": 0.257094144821167, "rewards/margins": 0.5953845977783203, "rewards/rejected": -0.3382904529571533, "step": 25 }, { "debug/policy_chosen_logits": -3.303532123565674, "debug/policy_chosen_logps": -172.21371459960938, "debug/policy_rejected_logits": -3.037935495376587, "debug/policy_rejected_logps": -192.28829956054688, "debug/reference_chosen_logps": -197.53707885742188, "debug/reference_rejected_logps": -173.1179656982422, "epoch": 0.5098039215686274, "grad_norm": 3.424028522129365, "learning_rate": 1e-06, "logits/chosen": -3.303532123565674, "logits/rejected": -3.037935495376587, "logps/chosen": -172.21371459960938, "logps/rejected": -192.28829956054688, "loss": 0.2072, "rewards/accuracies": 0.75, "rewards/chosen": 0.25323352217674255, "rewards/margins": 0.44493675231933594, "rewards/rejected": -0.19170325994491577, "step": 26 }, { "debug/policy_chosen_logits": -3.2078473567962646, "debug/policy_chosen_logps": -185.5386199951172, "debug/policy_rejected_logits": -3.058324098587036, "debug/policy_rejected_logps": -181.49124145507812, "debug/reference_chosen_logps": -216.10214233398438, "debug/reference_rejected_logps": -166.07806396484375, "epoch": 0.5294117647058824, "grad_norm": 3.2542715272608427, "learning_rate": 1e-06, "logits/chosen": -3.2078473567962646, "logits/rejected": -3.058324098587036, "logps/chosen": -185.5386199951172, "logps/rejected": -181.49124145507812, "loss": 0.1898, "rewards/accuracies": 0.75, "rewards/chosen": 0.3056352138519287, "rewards/margins": 0.45976707339286804, "rewards/rejected": -0.15413185954093933, "step": 27 }, { "debug/policy_chosen_logits": -3.1834847927093506, "debug/policy_chosen_logps": -189.82064819335938, "debug/policy_rejected_logits": -3.0259480476379395, "debug/policy_rejected_logps": -192.5255126953125, "debug/reference_chosen_logps": -225.47145080566406, "debug/reference_rejected_logps": -174.45132446289062, "epoch": 0.5490196078431373, "grad_norm": 5.011972117551199, "learning_rate": 1e-06, "logits/chosen": -3.1834847927093506, "logits/rejected": -3.0259480476379395, "logps/chosen": -189.82064819335938, "logps/rejected": -192.5255126953125, "loss": 0.2206, "rewards/accuracies": 0.75, "rewards/chosen": 0.3565079867839813, "rewards/margins": 0.5372498631477356, "rewards/rejected": -0.18074187636375427, "step": 28 }, { "debug/policy_chosen_logits": -3.439246416091919, "debug/policy_chosen_logps": -161.62721252441406, "debug/policy_rejected_logits": -2.9180073738098145, "debug/policy_rejected_logps": -201.15945434570312, "debug/reference_chosen_logps": -197.59341430664062, "debug/reference_rejected_logps": -162.81692504882812, "epoch": 0.5686274509803921, "grad_norm": 2.4188165120482044, "learning_rate": 1e-06, "logits/chosen": -3.439246416091919, "logits/rejected": -2.9180073738098145, "logps/chosen": -161.62721252441406, "logps/rejected": -201.15945434570312, "loss": 0.1577, "rewards/accuracies": 0.875, "rewards/chosen": 0.35966184735298157, "rewards/margins": 0.7430870532989502, "rewards/rejected": -0.38342520594596863, "step": 29 }, { "debug/policy_chosen_logits": -3.3921971321105957, "debug/policy_chosen_logps": -167.86477661132812, "debug/policy_rejected_logits": -3.1083662509918213, "debug/policy_rejected_logps": -186.5076141357422, "debug/reference_chosen_logps": -202.75399780273438, "debug/reference_rejected_logps": -165.93167114257812, "epoch": 0.5882352941176471, "grad_norm": 5.111153930194959, "learning_rate": 1e-06, "logits/chosen": -3.3921971321105957, "logits/rejected": -3.1083662509918213, "logps/chosen": -167.86477661132812, "logps/rejected": -186.5076141357422, "loss": 0.2254, "rewards/accuracies": 0.875, "rewards/chosen": 0.3488922417163849, "rewards/margins": 0.5546516180038452, "rewards/rejected": -0.20575937628746033, "step": 30 }, { "debug/policy_chosen_logits": -3.296841621398926, "debug/policy_chosen_logps": -186.3404541015625, "debug/policy_rejected_logits": -3.0427982807159424, "debug/policy_rejected_logps": -189.97247314453125, "debug/reference_chosen_logps": -225.02894592285156, "debug/reference_rejected_logps": -176.0540771484375, "epoch": 0.6078431372549019, "grad_norm": 5.753832158358882, "learning_rate": 1e-06, "logits/chosen": -3.296841621398926, "logits/rejected": -3.0427982807159424, "logps/chosen": -186.3404541015625, "logps/rejected": -189.97247314453125, "loss": 0.2514, "rewards/accuracies": 1.0, "rewards/chosen": 0.3868849575519562, "rewards/margins": 0.5260686874389648, "rewards/rejected": -0.13918372988700867, "step": 31 }, { "debug/policy_chosen_logits": -3.241013765335083, "debug/policy_chosen_logps": -183.28062438964844, "debug/policy_rejected_logits": -3.004754066467285, "debug/policy_rejected_logps": -206.055419921875, "debug/reference_chosen_logps": -215.31048583984375, "debug/reference_rejected_logps": -162.4297637939453, "epoch": 0.6274509803921569, "grad_norm": 2.9239290358904837, "learning_rate": 1e-06, "logits/chosen": -3.241013765335083, "logits/rejected": -3.004754066467285, "logps/chosen": -183.28062438964844, "logps/rejected": -206.055419921875, "loss": 0.1757, "rewards/accuracies": 1.0, "rewards/chosen": 0.32029855251312256, "rewards/margins": 0.7565551996231079, "rewards/rejected": -0.43625661730766296, "step": 32 }, { "debug/policy_chosen_logits": -3.3382985591888428, "debug/policy_chosen_logps": -180.47259521484375, "debug/policy_rejected_logits": -3.019533395767212, "debug/policy_rejected_logps": -206.90989685058594, "debug/reference_chosen_logps": -214.73683166503906, "debug/reference_rejected_logps": -155.3048095703125, "epoch": 0.6470588235294118, "grad_norm": 3.2393949291557003, "learning_rate": 1e-06, "logits/chosen": -3.3382985591888428, "logits/rejected": -3.019533395767212, "logps/chosen": -180.47259521484375, "logps/rejected": -206.90989685058594, "loss": 0.2186, "rewards/accuracies": 1.0, "rewards/chosen": 0.3426423668861389, "rewards/margins": 0.8586931228637695, "rewards/rejected": -0.5160508155822754, "step": 33 }, { "debug/policy_chosen_logits": -3.25339412689209, "debug/policy_chosen_logps": -177.0988006591797, "debug/policy_rejected_logits": -2.986337900161743, "debug/policy_rejected_logps": -201.41714477539062, "debug/reference_chosen_logps": -204.36630249023438, "debug/reference_rejected_logps": -173.47714233398438, "epoch": 0.6666666666666666, "grad_norm": 4.241231231386345, "learning_rate": 1e-06, "logits/chosen": -3.25339412689209, "logits/rejected": -2.986337900161743, "logps/chosen": -177.0988006591797, "logps/rejected": -201.41714477539062, "loss": 0.1937, "rewards/accuracies": 0.875, "rewards/chosen": 0.2726749777793884, "rewards/margins": 0.5520750284194946, "rewards/rejected": -0.2794000804424286, "step": 34 }, { "debug/policy_chosen_logits": -3.251969337463379, "debug/policy_chosen_logps": -190.11709594726562, "debug/policy_rejected_logits": -3.0741655826568604, "debug/policy_rejected_logps": -194.2032470703125, "debug/reference_chosen_logps": -217.17962646484375, "debug/reference_rejected_logps": -160.67848205566406, "epoch": 0.6862745098039216, "grad_norm": 4.744228039712321, "learning_rate": 1e-06, "logits/chosen": -3.251969337463379, "logits/rejected": -3.0741655826568604, "logps/chosen": -190.11709594726562, "logps/rejected": -194.2032470703125, "loss": 0.1497, "rewards/accuracies": 0.75, "rewards/chosen": 0.2706252336502075, "rewards/margins": 0.6058727502822876, "rewards/rejected": -0.3352475166320801, "step": 35 }, { "debug/policy_chosen_logits": -3.3483757972717285, "debug/policy_chosen_logps": -179.4599609375, "debug/policy_rejected_logits": -3.1370773315429688, "debug/policy_rejected_logps": -190.57760620117188, "debug/reference_chosen_logps": -211.63006591796875, "debug/reference_rejected_logps": -169.59912109375, "epoch": 0.7058823529411765, "grad_norm": 3.186431463787634, "learning_rate": 1e-06, "logits/chosen": -3.3483757972717285, "logits/rejected": -3.1370773315429688, "logps/chosen": -179.4599609375, "logps/rejected": -190.57760620117188, "loss": 0.2442, "rewards/accuracies": 0.75, "rewards/chosen": 0.32170116901397705, "rewards/margins": 0.531485915184021, "rewards/rejected": -0.20978482067584991, "step": 36 }, { "debug/policy_chosen_logits": -3.2231926918029785, "debug/policy_chosen_logps": -172.751220703125, "debug/policy_rejected_logits": -3.042171001434326, "debug/policy_rejected_logps": -192.6194610595703, "debug/reference_chosen_logps": -204.07984924316406, "debug/reference_rejected_logps": -162.96307373046875, "epoch": 0.7254901960784313, "grad_norm": 3.76917946103672, "learning_rate": 1e-06, "logits/chosen": -3.2231926918029785, "logits/rejected": -3.042171001434326, "logps/chosen": -172.751220703125, "logps/rejected": -192.6194610595703, "loss": 0.222, "rewards/accuracies": 1.0, "rewards/chosen": 0.31328636407852173, "rewards/margins": 0.609850287437439, "rewards/rejected": -0.29656392335891724, "step": 37 }, { "debug/policy_chosen_logits": -3.370556116104126, "debug/policy_chosen_logps": -179.37933349609375, "debug/policy_rejected_logits": -2.9038238525390625, "debug/policy_rejected_logps": -197.517333984375, "debug/reference_chosen_logps": -211.8248291015625, "debug/reference_rejected_logps": -159.6433563232422, "epoch": 0.7450980392156863, "grad_norm": 3.7765583713304034, "learning_rate": 1e-06, "logits/chosen": -3.370556116104126, "logits/rejected": -2.9038238525390625, "logps/chosen": -179.37933349609375, "logps/rejected": -197.517333984375, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.3244548439979553, "rewards/margins": 0.7031944990158081, "rewards/rejected": -0.3787396550178528, "step": 38 }, { "debug/policy_chosen_logits": -3.253465414047241, "debug/policy_chosen_logps": -188.2485809326172, "debug/policy_rejected_logits": -3.051933526992798, "debug/policy_rejected_logps": -186.41270446777344, "debug/reference_chosen_logps": -218.99688720703125, "debug/reference_rejected_logps": -167.86203002929688, "epoch": 0.7647058823529411, "grad_norm": 3.533915411617125, "learning_rate": 1e-06, "logits/chosen": -3.253465414047241, "logits/rejected": -3.051933526992798, "logps/chosen": -188.2485809326172, "logps/rejected": -186.41270446777344, "loss": 0.2102, "rewards/accuracies": 0.875, "rewards/chosen": 0.3074829876422882, "rewards/margins": 0.492989718914032, "rewards/rejected": -0.18550674617290497, "step": 39 }, { "debug/policy_chosen_logits": -3.397189140319824, "debug/policy_chosen_logps": -184.84124755859375, "debug/policy_rejected_logits": -2.951024055480957, "debug/policy_rejected_logps": -205.2071533203125, "debug/reference_chosen_logps": -216.28897094726562, "debug/reference_rejected_logps": -167.5351104736328, "epoch": 0.7843137254901961, "grad_norm": 4.343119675678065, "learning_rate": 1e-06, "logits/chosen": -3.397189140319824, "logits/rejected": -2.951024055480957, "logps/chosen": -184.84124755859375, "logps/rejected": -205.2071533203125, "loss": 0.1484, "rewards/accuracies": 0.875, "rewards/chosen": 0.3144773840904236, "rewards/margins": 0.6911977529525757, "rewards/rejected": -0.3767203688621521, "step": 40 }, { "debug/policy_chosen_logits": -3.4680333137512207, "debug/policy_chosen_logps": -174.45204162597656, "debug/policy_rejected_logits": -3.177816867828369, "debug/policy_rejected_logps": -200.5006103515625, "debug/reference_chosen_logps": -201.8401641845703, "debug/reference_rejected_logps": -168.33837890625, "epoch": 0.803921568627451, "grad_norm": 4.7398844072134, "learning_rate": 1e-06, "logits/chosen": -3.4680333137512207, "logits/rejected": -3.177816867828369, "logps/chosen": -174.45204162597656, "logps/rejected": -200.5006103515625, "loss": 0.1898, "rewards/accuracies": 1.0, "rewards/chosen": 0.273881196975708, "rewards/margins": 0.595503568649292, "rewards/rejected": -0.3216223418712616, "step": 41 }, { "debug/policy_chosen_logits": -3.3945956230163574, "debug/policy_chosen_logps": -172.79193115234375, "debug/policy_rejected_logits": -3.0443050861358643, "debug/policy_rejected_logps": -194.8880615234375, "debug/reference_chosen_logps": -198.82882690429688, "debug/reference_rejected_logps": -170.7233123779297, "epoch": 0.8235294117647058, "grad_norm": 5.166045026790051, "learning_rate": 1e-06, "logits/chosen": -3.3945956230163574, "logits/rejected": -3.0443050861358643, "logps/chosen": -172.79193115234375, "logps/rejected": -194.8880615234375, "loss": 0.1759, "rewards/accuracies": 1.0, "rewards/chosen": 0.26036909222602844, "rewards/margins": 0.5020167827606201, "rewards/rejected": -0.24164767563343048, "step": 42 }, { "debug/policy_chosen_logits": -3.366682291030884, "debug/policy_chosen_logps": -174.59375, "debug/policy_rejected_logits": -3.141162157058716, "debug/policy_rejected_logps": -200.96633911132812, "debug/reference_chosen_logps": -205.5618438720703, "debug/reference_rejected_logps": -192.6947021484375, "epoch": 0.8431372549019608, "grad_norm": 3.316025693224064, "learning_rate": 1e-06, "logits/chosen": -3.366682291030884, "logits/rejected": -3.141162157058716, "logps/chosen": -174.59375, "logps/rejected": -200.96633911132812, "loss": 0.1598, "rewards/accuracies": 0.875, "rewards/chosen": 0.3096809387207031, "rewards/margins": 0.3923972249031067, "rewards/rejected": -0.08271628618240356, "step": 43 }, { "debug/policy_chosen_logits": -3.4701032638549805, "debug/policy_chosen_logps": -172.69961547851562, "debug/policy_rejected_logits": -3.0816352367401123, "debug/policy_rejected_logps": -192.14527893066406, "debug/reference_chosen_logps": -203.4981231689453, "debug/reference_rejected_logps": -165.00967407226562, "epoch": 0.8627450980392157, "grad_norm": 5.732457245614043, "learning_rate": 1e-06, "logits/chosen": -3.4701032638549805, "logits/rejected": -3.0816352367401123, "logps/chosen": -172.69961547851562, "logps/rejected": -192.14527893066406, "loss": 0.2551, "rewards/accuracies": 0.875, "rewards/chosen": 0.30798494815826416, "rewards/margins": 0.5793408155441284, "rewards/rejected": -0.27135586738586426, "step": 44 }, { "debug/policy_chosen_logits": -3.3291234970092773, "debug/policy_chosen_logps": -170.3388671875, "debug/policy_rejected_logits": -2.937793493270874, "debug/policy_rejected_logps": -192.89581298828125, "debug/reference_chosen_logps": -202.19802856445312, "debug/reference_rejected_logps": -159.46893310546875, "epoch": 0.8823529411764706, "grad_norm": 5.771805579285295, "learning_rate": 1e-06, "logits/chosen": -3.3291234970092773, "logits/rejected": -2.937793493270874, "logps/chosen": -170.3388671875, "logps/rejected": -192.89581298828125, "loss": 0.2151, "rewards/accuracies": 0.875, "rewards/chosen": 0.31859153509140015, "rewards/margins": 0.6528602242469788, "rewards/rejected": -0.334268718957901, "step": 45 }, { "debug/policy_chosen_logits": -3.226428270339966, "debug/policy_chosen_logps": -187.73739624023438, "debug/policy_rejected_logits": -2.986654043197632, "debug/policy_rejected_logps": -192.33885192871094, "debug/reference_chosen_logps": -220.32342529296875, "debug/reference_rejected_logps": -167.2851104736328, "epoch": 0.9019607843137255, "grad_norm": 2.8823875670773695, "learning_rate": 1e-06, "logits/chosen": -3.226428270339966, "logits/rejected": -2.986654043197632, "logps/chosen": -187.73739624023438, "logps/rejected": -192.33885192871094, "loss": 0.1605, "rewards/accuracies": 0.875, "rewards/chosen": 0.32586026191711426, "rewards/margins": 0.5763977766036987, "rewards/rejected": -0.2505374550819397, "step": 46 }, { "debug/policy_chosen_logits": -3.5862808227539062, "debug/policy_chosen_logps": -182.93630981445312, "debug/policy_rejected_logits": -3.0501527786254883, "debug/policy_rejected_logps": -197.54100036621094, "debug/reference_chosen_logps": -213.4707794189453, "debug/reference_rejected_logps": -172.90322875976562, "epoch": 0.9215686274509803, "grad_norm": 2.720609192674643, "learning_rate": 1e-06, "logits/chosen": -3.5862808227539062, "logits/rejected": -3.0501527786254883, "logps/chosen": -182.93630981445312, "logps/rejected": -197.54100036621094, "loss": 0.1686, "rewards/accuracies": 1.0, "rewards/chosen": 0.30534470081329346, "rewards/margins": 0.5517222881317139, "rewards/rejected": -0.2463776171207428, "step": 47 }, { "debug/policy_chosen_logits": -3.3840320110321045, "debug/policy_chosen_logps": -175.84884643554688, "debug/policy_rejected_logits": -3.043017625808716, "debug/policy_rejected_logps": -202.57257080078125, "debug/reference_chosen_logps": -203.88320922851562, "debug/reference_rejected_logps": -172.34735107421875, "epoch": 0.9411764705882353, "grad_norm": 2.309188784503618, "learning_rate": 1e-06, "logits/chosen": -3.3840320110321045, "logits/rejected": -3.043017625808716, "logps/chosen": -175.84884643554688, "logps/rejected": -202.57257080078125, "loss": 0.1666, "rewards/accuracies": 1.0, "rewards/chosen": 0.28034350275993347, "rewards/margins": 0.5825955271720886, "rewards/rejected": -0.30225205421447754, "step": 48 }, { "debug/policy_chosen_logits": -3.3747286796569824, "debug/policy_chosen_logps": -185.90341186523438, "debug/policy_rejected_logits": -3.0797741413116455, "debug/policy_rejected_logps": -202.92413330078125, "debug/reference_chosen_logps": -221.17340087890625, "debug/reference_rejected_logps": -154.19923400878906, "epoch": 0.9607843137254902, "grad_norm": 2.7307344297698157, "learning_rate": 1e-06, "logits/chosen": -3.3747286796569824, "logits/rejected": -3.0797741413116455, "logps/chosen": -185.90341186523438, "logps/rejected": -202.92413330078125, "loss": 0.1461, "rewards/accuracies": 1.0, "rewards/chosen": 0.3527000844478607, "rewards/margins": 0.8399491310119629, "rewards/rejected": -0.48724907636642456, "step": 49 }, { "debug/policy_chosen_logits": -3.4781532287597656, "debug/policy_chosen_logps": -177.86842346191406, "debug/policy_rejected_logits": -3.0594165325164795, "debug/policy_rejected_logps": -195.89004516601562, "debug/reference_chosen_logps": -203.58407592773438, "debug/reference_rejected_logps": -161.39697265625, "epoch": 0.9803921568627451, "grad_norm": 3.3901490209041247, "learning_rate": 1e-06, "logits/chosen": -3.4781532287597656, "logits/rejected": -3.0594165325164795, "logps/chosen": -177.86842346191406, "logps/rejected": -195.89004516601562, "loss": 0.1895, "rewards/accuracies": 0.875, "rewards/chosen": 0.2571565508842468, "rewards/margins": 0.6020870208740234, "rewards/rejected": -0.3449305295944214, "step": 50 }, { "debug/policy_chosen_logits": -3.4821436405181885, "debug/policy_chosen_logps": -177.99000549316406, "debug/policy_rejected_logits": -3.0359578132629395, "debug/policy_rejected_logps": -203.94029235839844, "debug/reference_chosen_logps": -212.01348876953125, "debug/reference_rejected_logps": -166.06936645507812, "epoch": 1.0, "grad_norm": 3.0238129167739807, "learning_rate": 1e-06, "logits/chosen": -3.4821436405181885, "logits/rejected": -3.0359578132629395, "logps/chosen": -177.99000549316406, "logps/rejected": -203.94029235839844, "loss": 0.1383, "rewards/accuracies": 0.875, "rewards/chosen": 0.3402349352836609, "rewards/margins": 0.7189440727233887, "rewards/rejected": -0.37870916724205017, "step": 51 }, { "epoch": 1.0, "step": 51, "total_flos": 0.0, "train_loss": 0.26999635322421206, "train_runtime": 166.8574, "train_samples_per_second": 19.31, "train_steps_per_second": 0.306 } ], "logging_steps": 1, "max_steps": 51, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }