qwen2_coder_adamw_iter1 / trainer_state.json
yiran-wang3's picture
End of training
23a0146 verified
raw
history blame
44.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 51,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"debug/policy_chosen_logits": -3.131476879119873,
"debug/policy_chosen_logps": -223.49798583984375,
"debug/policy_rejected_logits": -3.0218234062194824,
"debug/policy_rejected_logps": -181.94036865234375,
"debug/reference_chosen_logps": -223.49798583984375,
"debug/reference_rejected_logps": -181.94036865234375,
"epoch": 0.0196078431372549,
"grad_norm": 9.59268936350444,
"learning_rate": 1e-06,
"logits/chosen": -3.131476879119873,
"logits/rejected": -3.0218234062194824,
"logps/chosen": -223.49798583984375,
"logps/rejected": -181.94036865234375,
"loss": 0.5,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"debug/policy_chosen_logits": -3.1443662643432617,
"debug/policy_chosen_logps": -209.216552734375,
"debug/policy_rejected_logits": -3.076768159866333,
"debug/policy_rejected_logps": -170.2884521484375,
"debug/reference_chosen_logps": -209.07872009277344,
"debug/reference_rejected_logps": -169.68731689453125,
"epoch": 0.0392156862745098,
"grad_norm": 8.71018223332819,
"learning_rate": 1e-06,
"logits/chosen": -3.1443662643432617,
"logits/rejected": -3.076768159866333,
"logps/chosen": -209.216552734375,
"logps/rejected": -170.2884521484375,
"loss": 0.4974,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.001378459855914116,
"rewards/margins": 0.004632873460650444,
"rewards/rejected": -0.00601133331656456,
"step": 2
},
{
"debug/policy_chosen_logits": -3.135432243347168,
"debug/policy_chosen_logps": -203.98123168945312,
"debug/policy_rejected_logits": -3.058173179626465,
"debug/policy_rejected_logps": -171.8382568359375,
"debug/reference_chosen_logps": -206.17086791992188,
"debug/reference_rejected_logps": -172.7147216796875,
"epoch": 0.058823529411764705,
"grad_norm": 11.448652871780954,
"learning_rate": 1e-06,
"logits/chosen": -3.135432243347168,
"logits/rejected": -3.058173179626465,
"logps/chosen": -203.98123168945312,
"logps/rejected": -171.8382568359375,
"loss": 0.4898,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02189634181559086,
"rewards/margins": 0.013131675310432911,
"rewards/rejected": 0.008764667436480522,
"step": 3
},
{
"debug/policy_chosen_logits": -3.0565543174743652,
"debug/policy_chosen_logps": -206.8490447998047,
"debug/policy_rejected_logits": -2.9550375938415527,
"debug/policy_rejected_logps": -174.601318359375,
"debug/reference_chosen_logps": -206.09422302246094,
"debug/reference_rejected_logps": -172.06332397460938,
"epoch": 0.0784313725490196,
"grad_norm": 8.681717474563778,
"learning_rate": 1e-06,
"logits/chosen": -3.0565543174743652,
"logits/rejected": -2.9550375938415527,
"logps/chosen": -206.8490447998047,
"logps/rejected": -174.601318359375,
"loss": 0.4855,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.007548102643340826,
"rewards/margins": 0.017831895500421524,
"rewards/rejected": -0.025379998609423637,
"step": 4
},
{
"debug/policy_chosen_logits": -3.0399742126464844,
"debug/policy_chosen_logps": -206.2752227783203,
"debug/policy_rejected_logits": -2.9232938289642334,
"debug/policy_rejected_logps": -156.70419311523438,
"debug/reference_chosen_logps": -205.56788635253906,
"debug/reference_rejected_logps": -150.99920654296875,
"epoch": 0.09803921568627451,
"grad_norm": 8.125347499138313,
"learning_rate": 1e-06,
"logits/chosen": -3.0399742126464844,
"logits/rejected": -2.9232938289642334,
"logps/chosen": -206.2752227783203,
"logps/rejected": -156.70419311523438,
"loss": 0.4648,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.00707334466278553,
"rewards/margins": 0.04997648298740387,
"rewards/rejected": -0.05704982578754425,
"step": 5
},
{
"debug/policy_chosen_logits": -3.2029356956481934,
"debug/policy_chosen_logps": -206.69122314453125,
"debug/policy_rejected_logits": -3.122507095336914,
"debug/policy_rejected_logps": -199.9852752685547,
"debug/reference_chosen_logps": -210.3363494873047,
"debug/reference_rejected_logps": -200.11654663085938,
"epoch": 0.11764705882352941,
"grad_norm": 10.6811806285762,
"learning_rate": 1e-06,
"logits/chosen": -3.2029356956481934,
"logits/rejected": -3.122507095336914,
"logps/chosen": -206.69122314453125,
"logps/rejected": -199.9852752685547,
"loss": 0.4523,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03645110875368118,
"rewards/margins": 0.03513820841908455,
"rewards/rejected": 0.0013129040598869324,
"step": 6
},
{
"debug/policy_chosen_logits": -3.202118396759033,
"debug/policy_chosen_logps": -198.13885498046875,
"debug/policy_rejected_logits": -2.9787750244140625,
"debug/policy_rejected_logps": -169.29666137695312,
"debug/reference_chosen_logps": -199.79531860351562,
"debug/reference_rejected_logps": -164.98037719726562,
"epoch": 0.13725490196078433,
"grad_norm": 7.511392268060962,
"learning_rate": 1e-06,
"logits/chosen": -3.202118396759033,
"logits/rejected": -2.9787750244140625,
"logps/chosen": -198.13885498046875,
"logps/rejected": -169.29666137695312,
"loss": 0.4439,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.01656484603881836,
"rewards/margins": 0.05972753465175629,
"rewards/rejected": -0.04316268861293793,
"step": 7
},
{
"debug/policy_chosen_logits": -3.1585052013397217,
"debug/policy_chosen_logps": -209.9014892578125,
"debug/policy_rejected_logits": -3.147646427154541,
"debug/policy_rejected_logps": -184.0531005859375,
"debug/reference_chosen_logps": -213.74742126464844,
"debug/reference_rejected_logps": -184.40182495117188,
"epoch": 0.1568627450980392,
"grad_norm": 9.384905399946895,
"learning_rate": 1e-06,
"logits/chosen": -3.1585052013397217,
"logits/rejected": -3.147646427154541,
"logps/chosen": -209.9014892578125,
"logps/rejected": -184.0531005859375,
"loss": 0.4553,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.03845922276377678,
"rewards/margins": 0.03497195988893509,
"rewards/rejected": 0.00348726287484169,
"step": 8
},
{
"debug/policy_chosen_logits": -3.219722032546997,
"debug/policy_chosen_logps": -204.11474609375,
"debug/policy_rejected_logits": -3.0252864360809326,
"debug/policy_rejected_logps": -165.35894775390625,
"debug/reference_chosen_logps": -207.07034301757812,
"debug/reference_rejected_logps": -154.7605438232422,
"epoch": 0.17647058823529413,
"grad_norm": 6.128726988165846,
"learning_rate": 1e-06,
"logits/chosen": -3.219722032546997,
"logits/rejected": -3.0252864360809326,
"logps/chosen": -204.11474609375,
"logps/rejected": -165.35894775390625,
"loss": 0.4068,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02955583482980728,
"rewards/margins": 0.1355399787425995,
"rewards/rejected": -0.1059841513633728,
"step": 9
},
{
"debug/policy_chosen_logits": -3.3743577003479004,
"debug/policy_chosen_logps": -182.16360473632812,
"debug/policy_rejected_logits": -2.9649839401245117,
"debug/policy_rejected_logps": -173.23670959472656,
"debug/reference_chosen_logps": -190.21826171875,
"debug/reference_rejected_logps": -162.2075958251953,
"epoch": 0.19607843137254902,
"grad_norm": 6.364236944929035,
"learning_rate": 1e-06,
"logits/chosen": -3.3743577003479004,
"logits/rejected": -2.9649839401245117,
"logps/chosen": -182.16360473632812,
"logps/rejected": -173.23670959472656,
"loss": 0.3573,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08054651319980621,
"rewards/margins": 0.19083772599697113,
"rewards/rejected": -0.11029121279716492,
"step": 10
},
{
"debug/policy_chosen_logits": -3.173790454864502,
"debug/policy_chosen_logps": -203.52264404296875,
"debug/policy_rejected_logits": -2.949070692062378,
"debug/policy_rejected_logps": -178.64117431640625,
"debug/reference_chosen_logps": -212.055908203125,
"debug/reference_rejected_logps": -169.71719360351562,
"epoch": 0.21568627450980393,
"grad_norm": 5.353556279413172,
"learning_rate": 1e-06,
"logits/chosen": -3.173790454864502,
"logits/rejected": -2.949070692062378,
"logps/chosen": -203.52264404296875,
"logps/rejected": -178.64117431640625,
"loss": 0.3634,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.08533257991075516,
"rewards/margins": 0.1745723932981491,
"rewards/rejected": -0.08923980593681335,
"step": 11
},
{
"debug/policy_chosen_logits": -3.277130126953125,
"debug/policy_chosen_logps": -199.06358337402344,
"debug/policy_rejected_logits": -3.132725954055786,
"debug/policy_rejected_logps": -179.83506774902344,
"debug/reference_chosen_logps": -208.1540985107422,
"debug/reference_rejected_logps": -177.0548095703125,
"epoch": 0.23529411764705882,
"grad_norm": 5.672030188679155,
"learning_rate": 1e-06,
"logits/chosen": -3.277130126953125,
"logits/rejected": -3.132725954055786,
"logps/chosen": -199.06358337402344,
"logps/rejected": -179.83506774902344,
"loss": 0.3502,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.09090512990951538,
"rewards/margins": 0.11870768666267395,
"rewards/rejected": -0.027802541851997375,
"step": 12
},
{
"debug/policy_chosen_logits": -3.1242167949676514,
"debug/policy_chosen_logps": -191.25042724609375,
"debug/policy_rejected_logits": -2.9863691329956055,
"debug/policy_rejected_logps": -175.88369750976562,
"debug/reference_chosen_logps": -196.898193359375,
"debug/reference_rejected_logps": -166.8399658203125,
"epoch": 0.2549019607843137,
"grad_norm": 5.205320381513838,
"learning_rate": 1e-06,
"logits/chosen": -3.1242167949676514,
"logits/rejected": -2.9863691329956055,
"logps/chosen": -191.25042724609375,
"logps/rejected": -175.88369750976562,
"loss": 0.37,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.056477658450603485,
"rewards/margins": 0.14691495895385742,
"rewards/rejected": -0.09043729305267334,
"step": 13
},
{
"debug/policy_chosen_logits": -3.144406318664551,
"debug/policy_chosen_logps": -218.8365478515625,
"debug/policy_rejected_logits": -2.8921687602996826,
"debug/policy_rejected_logps": -180.90426635742188,
"debug/reference_chosen_logps": -229.2056884765625,
"debug/reference_rejected_logps": -170.25051879882812,
"epoch": 0.27450980392156865,
"grad_norm": 5.1865412600469725,
"learning_rate": 1e-06,
"logits/chosen": -3.144406318664551,
"logits/rejected": -2.8921687602996826,
"logps/chosen": -218.8365478515625,
"logps/rejected": -180.90426635742188,
"loss": 0.3456,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10369150340557098,
"rewards/margins": 0.21022894978523254,
"rewards/rejected": -0.10653746128082275,
"step": 14
},
{
"debug/policy_chosen_logits": -3.2248146533966064,
"debug/policy_chosen_logps": -201.5963134765625,
"debug/policy_rejected_logits": -2.995159149169922,
"debug/policy_rejected_logps": -184.41653442382812,
"debug/reference_chosen_logps": -210.53994750976562,
"debug/reference_rejected_logps": -177.02459716796875,
"epoch": 0.29411764705882354,
"grad_norm": 5.281953187102778,
"learning_rate": 1e-06,
"logits/chosen": -3.2248146533966064,
"logits/rejected": -2.995159149169922,
"logps/chosen": -201.5963134765625,
"logps/rejected": -184.41653442382812,
"loss": 0.3298,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.08943626284599304,
"rewards/margins": 0.163355752825737,
"rewards/rejected": -0.07391948252916336,
"step": 15
},
{
"debug/policy_chosen_logits": -3.152334213256836,
"debug/policy_chosen_logps": -203.98538208007812,
"debug/policy_rejected_logits": -3.0059425830841064,
"debug/policy_rejected_logps": -174.17031860351562,
"debug/reference_chosen_logps": -212.34693908691406,
"debug/reference_rejected_logps": -161.72183227539062,
"epoch": 0.3137254901960784,
"grad_norm": 4.990441768681764,
"learning_rate": 1e-06,
"logits/chosen": -3.152334213256836,
"logits/rejected": -3.0059425830841064,
"logps/chosen": -203.98538208007812,
"logps/rejected": -174.17031860351562,
"loss": 0.3315,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08361560851335526,
"rewards/margins": 0.2081003040075302,
"rewards/rejected": -0.12448470294475555,
"step": 16
},
{
"debug/policy_chosen_logits": -3.3237087726593018,
"debug/policy_chosen_logps": -197.6831512451172,
"debug/policy_rejected_logits": -3.1152756214141846,
"debug/policy_rejected_logps": -185.88690185546875,
"debug/reference_chosen_logps": -209.38027954101562,
"debug/reference_rejected_logps": -171.8429412841797,
"epoch": 0.3333333333333333,
"grad_norm": 3.847170282499073,
"learning_rate": 1e-06,
"logits/chosen": -3.3237087726593018,
"logits/rejected": -3.1152756214141846,
"logps/chosen": -197.6831512451172,
"logps/rejected": -185.88690185546875,
"loss": 0.3262,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.11697111278772354,
"rewards/margins": 0.25741061568260193,
"rewards/rejected": -0.14043951034545898,
"step": 17
},
{
"debug/policy_chosen_logits": -3.423741102218628,
"debug/policy_chosen_logps": -197.50125122070312,
"debug/policy_rejected_logits": -3.0541563034057617,
"debug/policy_rejected_logps": -194.04254150390625,
"debug/reference_chosen_logps": -214.143310546875,
"debug/reference_rejected_logps": -176.38931274414062,
"epoch": 0.35294117647058826,
"grad_norm": 4.6640459530058145,
"learning_rate": 1e-06,
"logits/chosen": -3.423741102218628,
"logits/rejected": -3.0541563034057617,
"logps/chosen": -197.50125122070312,
"logps/rejected": -194.04254150390625,
"loss": 0.2745,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.16642045974731445,
"rewards/margins": 0.3429526686668396,
"rewards/rejected": -0.17653217911720276,
"step": 18
},
{
"debug/policy_chosen_logits": -3.1585283279418945,
"debug/policy_chosen_logps": -205.2724151611328,
"debug/policy_rejected_logits": -2.9066219329833984,
"debug/policy_rejected_logps": -190.3876953125,
"debug/reference_chosen_logps": -227.2172088623047,
"debug/reference_rejected_logps": -159.18490600585938,
"epoch": 0.37254901960784315,
"grad_norm": 4.373436851253389,
"learning_rate": 1e-06,
"logits/chosen": -3.1585283279418945,
"logits/rejected": -2.9066219329833984,
"logps/chosen": -205.2724151611328,
"logps/rejected": -190.3876953125,
"loss": 0.2051,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2194480001926422,
"rewards/margins": 0.5314759016036987,
"rewards/rejected": -0.3120279312133789,
"step": 19
},
{
"debug/policy_chosen_logits": -3.3925397396087646,
"debug/policy_chosen_logps": -182.79222106933594,
"debug/policy_rejected_logits": -2.9614052772521973,
"debug/policy_rejected_logps": -192.5382843017578,
"debug/reference_chosen_logps": -208.93617248535156,
"debug/reference_rejected_logps": -170.37913513183594,
"epoch": 0.39215686274509803,
"grad_norm": 7.041418694876974,
"learning_rate": 1e-06,
"logits/chosen": -3.3925397396087646,
"logits/rejected": -2.9614052772521973,
"logps/chosen": -182.79222106933594,
"logps/rejected": -192.5382843017578,
"loss": 0.2688,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.2614395320415497,
"rewards/margins": 0.48303085565567017,
"rewards/rejected": -0.22159132361412048,
"step": 20
},
{
"debug/policy_chosen_logits": -3.3081858158111572,
"debug/policy_chosen_logps": -170.76687622070312,
"debug/policy_rejected_logits": -2.9337339401245117,
"debug/policy_rejected_logps": -190.08706665039062,
"debug/reference_chosen_logps": -194.40255737304688,
"debug/reference_rejected_logps": -161.28668212890625,
"epoch": 0.4117647058823529,
"grad_norm": 5.913800772738065,
"learning_rate": 1e-06,
"logits/chosen": -3.3081858158111572,
"logits/rejected": -2.9337339401245117,
"logps/chosen": -170.76687622070312,
"logps/rejected": -190.08706665039062,
"loss": 0.2346,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2363569736480713,
"rewards/margins": 0.5243606567382812,
"rewards/rejected": -0.28800368309020996,
"step": 21
},
{
"debug/policy_chosen_logits": -3.262423276901245,
"debug/policy_chosen_logps": -174.46621704101562,
"debug/policy_rejected_logits": -2.9751062393188477,
"debug/policy_rejected_logps": -194.79879760742188,
"debug/reference_chosen_logps": -195.777587890625,
"debug/reference_rejected_logps": -166.50228881835938,
"epoch": 0.43137254901960786,
"grad_norm": 3.6878540395392996,
"learning_rate": 1e-06,
"logits/chosen": -3.262423276901245,
"logits/rejected": -2.9751062393188477,
"logps/chosen": -174.46621704101562,
"logps/rejected": -194.79879760742188,
"loss": 0.1804,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.21311378479003906,
"rewards/margins": 0.4960786700248718,
"rewards/rejected": -0.28296488523483276,
"step": 22
},
{
"debug/policy_chosen_logits": -3.284288167953491,
"debug/policy_chosen_logps": -201.29129028320312,
"debug/policy_rejected_logits": -3.055243730545044,
"debug/policy_rejected_logps": -189.35101318359375,
"debug/reference_chosen_logps": -227.23031616210938,
"debug/reference_rejected_logps": -170.58262634277344,
"epoch": 0.45098039215686275,
"grad_norm": 3.4160494981935057,
"learning_rate": 1e-06,
"logits/chosen": -3.284288167953491,
"logits/rejected": -3.055243730545044,
"logps/chosen": -201.29129028320312,
"logps/rejected": -189.35101318359375,
"loss": 0.2467,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.2593901753425598,
"rewards/margins": 0.44707390666007996,
"rewards/rejected": -0.18768377602100372,
"step": 23
},
{
"debug/policy_chosen_logits": -3.3582632541656494,
"debug/policy_chosen_logps": -180.80130004882812,
"debug/policy_rejected_logits": -2.9310052394866943,
"debug/policy_rejected_logps": -198.58038330078125,
"debug/reference_chosen_logps": -208.0858917236328,
"debug/reference_rejected_logps": -156.55642700195312,
"epoch": 0.47058823529411764,
"grad_norm": 4.3584453143571515,
"learning_rate": 1e-06,
"logits/chosen": -3.3582632541656494,
"logits/rejected": -2.9310052394866943,
"logps/chosen": -180.80130004882812,
"logps/rejected": -198.58038330078125,
"loss": 0.1894,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2728460729122162,
"rewards/margins": 0.6930855512619019,
"rewards/rejected": -0.42023950815200806,
"step": 24
},
{
"debug/policy_chosen_logits": -3.294647455215454,
"debug/policy_chosen_logps": -200.66555786132812,
"debug/policy_rejected_logits": -2.9623827934265137,
"debug/policy_rejected_logps": -209.60763549804688,
"debug/reference_chosen_logps": -226.37498474121094,
"debug/reference_rejected_logps": -175.77857971191406,
"epoch": 0.49019607843137253,
"grad_norm": 5.190786042631128,
"learning_rate": 1e-06,
"logits/chosen": -3.294647455215454,
"logits/rejected": -2.9623827934265137,
"logps/chosen": -200.66555786132812,
"logps/rejected": -209.60763549804688,
"loss": 0.2067,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.257094144821167,
"rewards/margins": 0.5953845977783203,
"rewards/rejected": -0.3382904529571533,
"step": 25
},
{
"debug/policy_chosen_logits": -3.303532123565674,
"debug/policy_chosen_logps": -172.21371459960938,
"debug/policy_rejected_logits": -3.037935495376587,
"debug/policy_rejected_logps": -192.28829956054688,
"debug/reference_chosen_logps": -197.53707885742188,
"debug/reference_rejected_logps": -173.1179656982422,
"epoch": 0.5098039215686274,
"grad_norm": 3.424028522129365,
"learning_rate": 1e-06,
"logits/chosen": -3.303532123565674,
"logits/rejected": -3.037935495376587,
"logps/chosen": -172.21371459960938,
"logps/rejected": -192.28829956054688,
"loss": 0.2072,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.25323352217674255,
"rewards/margins": 0.44493675231933594,
"rewards/rejected": -0.19170325994491577,
"step": 26
},
{
"debug/policy_chosen_logits": -3.2078473567962646,
"debug/policy_chosen_logps": -185.5386199951172,
"debug/policy_rejected_logits": -3.058324098587036,
"debug/policy_rejected_logps": -181.49124145507812,
"debug/reference_chosen_logps": -216.10214233398438,
"debug/reference_rejected_logps": -166.07806396484375,
"epoch": 0.5294117647058824,
"grad_norm": 3.2542715272608427,
"learning_rate": 1e-06,
"logits/chosen": -3.2078473567962646,
"logits/rejected": -3.058324098587036,
"logps/chosen": -185.5386199951172,
"logps/rejected": -181.49124145507812,
"loss": 0.1898,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.3056352138519287,
"rewards/margins": 0.45976707339286804,
"rewards/rejected": -0.15413185954093933,
"step": 27
},
{
"debug/policy_chosen_logits": -3.1834847927093506,
"debug/policy_chosen_logps": -189.82064819335938,
"debug/policy_rejected_logits": -3.0259480476379395,
"debug/policy_rejected_logps": -192.5255126953125,
"debug/reference_chosen_logps": -225.47145080566406,
"debug/reference_rejected_logps": -174.45132446289062,
"epoch": 0.5490196078431373,
"grad_norm": 5.011972117551199,
"learning_rate": 1e-06,
"logits/chosen": -3.1834847927093506,
"logits/rejected": -3.0259480476379395,
"logps/chosen": -189.82064819335938,
"logps/rejected": -192.5255126953125,
"loss": 0.2206,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.3565079867839813,
"rewards/margins": 0.5372498631477356,
"rewards/rejected": -0.18074187636375427,
"step": 28
},
{
"debug/policy_chosen_logits": -3.439246416091919,
"debug/policy_chosen_logps": -161.62721252441406,
"debug/policy_rejected_logits": -2.9180073738098145,
"debug/policy_rejected_logps": -201.15945434570312,
"debug/reference_chosen_logps": -197.59341430664062,
"debug/reference_rejected_logps": -162.81692504882812,
"epoch": 0.5686274509803921,
"grad_norm": 2.4188165120482044,
"learning_rate": 1e-06,
"logits/chosen": -3.439246416091919,
"logits/rejected": -2.9180073738098145,
"logps/chosen": -161.62721252441406,
"logps/rejected": -201.15945434570312,
"loss": 0.1577,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.35966184735298157,
"rewards/margins": 0.7430870532989502,
"rewards/rejected": -0.38342520594596863,
"step": 29
},
{
"debug/policy_chosen_logits": -3.3921971321105957,
"debug/policy_chosen_logps": -167.86477661132812,
"debug/policy_rejected_logits": -3.1083662509918213,
"debug/policy_rejected_logps": -186.5076141357422,
"debug/reference_chosen_logps": -202.75399780273438,
"debug/reference_rejected_logps": -165.93167114257812,
"epoch": 0.5882352941176471,
"grad_norm": 5.111153930194959,
"learning_rate": 1e-06,
"logits/chosen": -3.3921971321105957,
"logits/rejected": -3.1083662509918213,
"logps/chosen": -167.86477661132812,
"logps/rejected": -186.5076141357422,
"loss": 0.2254,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3488922417163849,
"rewards/margins": 0.5546516180038452,
"rewards/rejected": -0.20575937628746033,
"step": 30
},
{
"debug/policy_chosen_logits": -3.296841621398926,
"debug/policy_chosen_logps": -186.3404541015625,
"debug/policy_rejected_logits": -3.0427982807159424,
"debug/policy_rejected_logps": -189.97247314453125,
"debug/reference_chosen_logps": -225.02894592285156,
"debug/reference_rejected_logps": -176.0540771484375,
"epoch": 0.6078431372549019,
"grad_norm": 5.753832158358882,
"learning_rate": 1e-06,
"logits/chosen": -3.296841621398926,
"logits/rejected": -3.0427982807159424,
"logps/chosen": -186.3404541015625,
"logps/rejected": -189.97247314453125,
"loss": 0.2514,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3868849575519562,
"rewards/margins": 0.5260686874389648,
"rewards/rejected": -0.13918372988700867,
"step": 31
},
{
"debug/policy_chosen_logits": -3.241013765335083,
"debug/policy_chosen_logps": -183.28062438964844,
"debug/policy_rejected_logits": -3.004754066467285,
"debug/policy_rejected_logps": -206.055419921875,
"debug/reference_chosen_logps": -215.31048583984375,
"debug/reference_rejected_logps": -162.4297637939453,
"epoch": 0.6274509803921569,
"grad_norm": 2.9239290358904837,
"learning_rate": 1e-06,
"logits/chosen": -3.241013765335083,
"logits/rejected": -3.004754066467285,
"logps/chosen": -183.28062438964844,
"logps/rejected": -206.055419921875,
"loss": 0.1757,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.32029855251312256,
"rewards/margins": 0.7565551996231079,
"rewards/rejected": -0.43625661730766296,
"step": 32
},
{
"debug/policy_chosen_logits": -3.3382985591888428,
"debug/policy_chosen_logps": -180.47259521484375,
"debug/policy_rejected_logits": -3.019533395767212,
"debug/policy_rejected_logps": -206.90989685058594,
"debug/reference_chosen_logps": -214.73683166503906,
"debug/reference_rejected_logps": -155.3048095703125,
"epoch": 0.6470588235294118,
"grad_norm": 3.2393949291557003,
"learning_rate": 1e-06,
"logits/chosen": -3.3382985591888428,
"logits/rejected": -3.019533395767212,
"logps/chosen": -180.47259521484375,
"logps/rejected": -206.90989685058594,
"loss": 0.2186,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3426423668861389,
"rewards/margins": 0.8586931228637695,
"rewards/rejected": -0.5160508155822754,
"step": 33
},
{
"debug/policy_chosen_logits": -3.25339412689209,
"debug/policy_chosen_logps": -177.0988006591797,
"debug/policy_rejected_logits": -2.986337900161743,
"debug/policy_rejected_logps": -201.41714477539062,
"debug/reference_chosen_logps": -204.36630249023438,
"debug/reference_rejected_logps": -173.47714233398438,
"epoch": 0.6666666666666666,
"grad_norm": 4.241231231386345,
"learning_rate": 1e-06,
"logits/chosen": -3.25339412689209,
"logits/rejected": -2.986337900161743,
"logps/chosen": -177.0988006591797,
"logps/rejected": -201.41714477539062,
"loss": 0.1937,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2726749777793884,
"rewards/margins": 0.5520750284194946,
"rewards/rejected": -0.2794000804424286,
"step": 34
},
{
"debug/policy_chosen_logits": -3.251969337463379,
"debug/policy_chosen_logps": -190.11709594726562,
"debug/policy_rejected_logits": -3.0741655826568604,
"debug/policy_rejected_logps": -194.2032470703125,
"debug/reference_chosen_logps": -217.17962646484375,
"debug/reference_rejected_logps": -160.67848205566406,
"epoch": 0.6862745098039216,
"grad_norm": 4.744228039712321,
"learning_rate": 1e-06,
"logits/chosen": -3.251969337463379,
"logits/rejected": -3.0741655826568604,
"logps/chosen": -190.11709594726562,
"logps/rejected": -194.2032470703125,
"loss": 0.1497,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.2706252336502075,
"rewards/margins": 0.6058727502822876,
"rewards/rejected": -0.3352475166320801,
"step": 35
},
{
"debug/policy_chosen_logits": -3.3483757972717285,
"debug/policy_chosen_logps": -179.4599609375,
"debug/policy_rejected_logits": -3.1370773315429688,
"debug/policy_rejected_logps": -190.57760620117188,
"debug/reference_chosen_logps": -211.63006591796875,
"debug/reference_rejected_logps": -169.59912109375,
"epoch": 0.7058823529411765,
"grad_norm": 3.186431463787634,
"learning_rate": 1e-06,
"logits/chosen": -3.3483757972717285,
"logits/rejected": -3.1370773315429688,
"logps/chosen": -179.4599609375,
"logps/rejected": -190.57760620117188,
"loss": 0.2442,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.32170116901397705,
"rewards/margins": 0.531485915184021,
"rewards/rejected": -0.20978482067584991,
"step": 36
},
{
"debug/policy_chosen_logits": -3.2231926918029785,
"debug/policy_chosen_logps": -172.751220703125,
"debug/policy_rejected_logits": -3.042171001434326,
"debug/policy_rejected_logps": -192.6194610595703,
"debug/reference_chosen_logps": -204.07984924316406,
"debug/reference_rejected_logps": -162.96307373046875,
"epoch": 0.7254901960784313,
"grad_norm": 3.76917946103672,
"learning_rate": 1e-06,
"logits/chosen": -3.2231926918029785,
"logits/rejected": -3.042171001434326,
"logps/chosen": -172.751220703125,
"logps/rejected": -192.6194610595703,
"loss": 0.222,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.31328636407852173,
"rewards/margins": 0.609850287437439,
"rewards/rejected": -0.29656392335891724,
"step": 37
},
{
"debug/policy_chosen_logits": -3.370556116104126,
"debug/policy_chosen_logps": -179.37933349609375,
"debug/policy_rejected_logits": -2.9038238525390625,
"debug/policy_rejected_logps": -197.517333984375,
"debug/reference_chosen_logps": -211.8248291015625,
"debug/reference_rejected_logps": -159.6433563232422,
"epoch": 0.7450980392156863,
"grad_norm": 3.7765583713304034,
"learning_rate": 1e-06,
"logits/chosen": -3.370556116104126,
"logits/rejected": -2.9038238525390625,
"logps/chosen": -179.37933349609375,
"logps/rejected": -197.517333984375,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3244548439979553,
"rewards/margins": 0.7031944990158081,
"rewards/rejected": -0.3787396550178528,
"step": 38
},
{
"debug/policy_chosen_logits": -3.253465414047241,
"debug/policy_chosen_logps": -188.2485809326172,
"debug/policy_rejected_logits": -3.051933526992798,
"debug/policy_rejected_logps": -186.41270446777344,
"debug/reference_chosen_logps": -218.99688720703125,
"debug/reference_rejected_logps": -167.86203002929688,
"epoch": 0.7647058823529411,
"grad_norm": 3.533915411617125,
"learning_rate": 1e-06,
"logits/chosen": -3.253465414047241,
"logits/rejected": -3.051933526992798,
"logps/chosen": -188.2485809326172,
"logps/rejected": -186.41270446777344,
"loss": 0.2102,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3074829876422882,
"rewards/margins": 0.492989718914032,
"rewards/rejected": -0.18550674617290497,
"step": 39
},
{
"debug/policy_chosen_logits": -3.397189140319824,
"debug/policy_chosen_logps": -184.84124755859375,
"debug/policy_rejected_logits": -2.951024055480957,
"debug/policy_rejected_logps": -205.2071533203125,
"debug/reference_chosen_logps": -216.28897094726562,
"debug/reference_rejected_logps": -167.5351104736328,
"epoch": 0.7843137254901961,
"grad_norm": 4.343119675678065,
"learning_rate": 1e-06,
"logits/chosen": -3.397189140319824,
"logits/rejected": -2.951024055480957,
"logps/chosen": -184.84124755859375,
"logps/rejected": -205.2071533203125,
"loss": 0.1484,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3144773840904236,
"rewards/margins": 0.6911977529525757,
"rewards/rejected": -0.3767203688621521,
"step": 40
},
{
"debug/policy_chosen_logits": -3.4680333137512207,
"debug/policy_chosen_logps": -174.45204162597656,
"debug/policy_rejected_logits": -3.177816867828369,
"debug/policy_rejected_logps": -200.5006103515625,
"debug/reference_chosen_logps": -201.8401641845703,
"debug/reference_rejected_logps": -168.33837890625,
"epoch": 0.803921568627451,
"grad_norm": 4.7398844072134,
"learning_rate": 1e-06,
"logits/chosen": -3.4680333137512207,
"logits/rejected": -3.177816867828369,
"logps/chosen": -174.45204162597656,
"logps/rejected": -200.5006103515625,
"loss": 0.1898,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.273881196975708,
"rewards/margins": 0.595503568649292,
"rewards/rejected": -0.3216223418712616,
"step": 41
},
{
"debug/policy_chosen_logits": -3.3945956230163574,
"debug/policy_chosen_logps": -172.79193115234375,
"debug/policy_rejected_logits": -3.0443050861358643,
"debug/policy_rejected_logps": -194.8880615234375,
"debug/reference_chosen_logps": -198.82882690429688,
"debug/reference_rejected_logps": -170.7233123779297,
"epoch": 0.8235294117647058,
"grad_norm": 5.166045026790051,
"learning_rate": 1e-06,
"logits/chosen": -3.3945956230163574,
"logits/rejected": -3.0443050861358643,
"logps/chosen": -172.79193115234375,
"logps/rejected": -194.8880615234375,
"loss": 0.1759,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.26036909222602844,
"rewards/margins": 0.5020167827606201,
"rewards/rejected": -0.24164767563343048,
"step": 42
},
{
"debug/policy_chosen_logits": -3.366682291030884,
"debug/policy_chosen_logps": -174.59375,
"debug/policy_rejected_logits": -3.141162157058716,
"debug/policy_rejected_logps": -200.96633911132812,
"debug/reference_chosen_logps": -205.5618438720703,
"debug/reference_rejected_logps": -192.6947021484375,
"epoch": 0.8431372549019608,
"grad_norm": 3.316025693224064,
"learning_rate": 1e-06,
"logits/chosen": -3.366682291030884,
"logits/rejected": -3.141162157058716,
"logps/chosen": -174.59375,
"logps/rejected": -200.96633911132812,
"loss": 0.1598,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3096809387207031,
"rewards/margins": 0.3923972249031067,
"rewards/rejected": -0.08271628618240356,
"step": 43
},
{
"debug/policy_chosen_logits": -3.4701032638549805,
"debug/policy_chosen_logps": -172.69961547851562,
"debug/policy_rejected_logits": -3.0816352367401123,
"debug/policy_rejected_logps": -192.14527893066406,
"debug/reference_chosen_logps": -203.4981231689453,
"debug/reference_rejected_logps": -165.00967407226562,
"epoch": 0.8627450980392157,
"grad_norm": 5.732457245614043,
"learning_rate": 1e-06,
"logits/chosen": -3.4701032638549805,
"logits/rejected": -3.0816352367401123,
"logps/chosen": -172.69961547851562,
"logps/rejected": -192.14527893066406,
"loss": 0.2551,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.30798494815826416,
"rewards/margins": 0.5793408155441284,
"rewards/rejected": -0.27135586738586426,
"step": 44
},
{
"debug/policy_chosen_logits": -3.3291234970092773,
"debug/policy_chosen_logps": -170.3388671875,
"debug/policy_rejected_logits": -2.937793493270874,
"debug/policy_rejected_logps": -192.89581298828125,
"debug/reference_chosen_logps": -202.19802856445312,
"debug/reference_rejected_logps": -159.46893310546875,
"epoch": 0.8823529411764706,
"grad_norm": 5.771805579285295,
"learning_rate": 1e-06,
"logits/chosen": -3.3291234970092773,
"logits/rejected": -2.937793493270874,
"logps/chosen": -170.3388671875,
"logps/rejected": -192.89581298828125,
"loss": 0.2151,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.31859153509140015,
"rewards/margins": 0.6528602242469788,
"rewards/rejected": -0.334268718957901,
"step": 45
},
{
"debug/policy_chosen_logits": -3.226428270339966,
"debug/policy_chosen_logps": -187.73739624023438,
"debug/policy_rejected_logits": -2.986654043197632,
"debug/policy_rejected_logps": -192.33885192871094,
"debug/reference_chosen_logps": -220.32342529296875,
"debug/reference_rejected_logps": -167.2851104736328,
"epoch": 0.9019607843137255,
"grad_norm": 2.8823875670773695,
"learning_rate": 1e-06,
"logits/chosen": -3.226428270339966,
"logits/rejected": -2.986654043197632,
"logps/chosen": -187.73739624023438,
"logps/rejected": -192.33885192871094,
"loss": 0.1605,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.32586026191711426,
"rewards/margins": 0.5763977766036987,
"rewards/rejected": -0.2505374550819397,
"step": 46
},
{
"debug/policy_chosen_logits": -3.5862808227539062,
"debug/policy_chosen_logps": -182.93630981445312,
"debug/policy_rejected_logits": -3.0501527786254883,
"debug/policy_rejected_logps": -197.54100036621094,
"debug/reference_chosen_logps": -213.4707794189453,
"debug/reference_rejected_logps": -172.90322875976562,
"epoch": 0.9215686274509803,
"grad_norm": 2.720609192674643,
"learning_rate": 1e-06,
"logits/chosen": -3.5862808227539062,
"logits/rejected": -3.0501527786254883,
"logps/chosen": -182.93630981445312,
"logps/rejected": -197.54100036621094,
"loss": 0.1686,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30534470081329346,
"rewards/margins": 0.5517222881317139,
"rewards/rejected": -0.2463776171207428,
"step": 47
},
{
"debug/policy_chosen_logits": -3.3840320110321045,
"debug/policy_chosen_logps": -175.84884643554688,
"debug/policy_rejected_logits": -3.043017625808716,
"debug/policy_rejected_logps": -202.57257080078125,
"debug/reference_chosen_logps": -203.88320922851562,
"debug/reference_rejected_logps": -172.34735107421875,
"epoch": 0.9411764705882353,
"grad_norm": 2.309188784503618,
"learning_rate": 1e-06,
"logits/chosen": -3.3840320110321045,
"logits/rejected": -3.043017625808716,
"logps/chosen": -175.84884643554688,
"logps/rejected": -202.57257080078125,
"loss": 0.1666,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.28034350275993347,
"rewards/margins": 0.5825955271720886,
"rewards/rejected": -0.30225205421447754,
"step": 48
},
{
"debug/policy_chosen_logits": -3.3747286796569824,
"debug/policy_chosen_logps": -185.90341186523438,
"debug/policy_rejected_logits": -3.0797741413116455,
"debug/policy_rejected_logps": -202.92413330078125,
"debug/reference_chosen_logps": -221.17340087890625,
"debug/reference_rejected_logps": -154.19923400878906,
"epoch": 0.9607843137254902,
"grad_norm": 2.7307344297698157,
"learning_rate": 1e-06,
"logits/chosen": -3.3747286796569824,
"logits/rejected": -3.0797741413116455,
"logps/chosen": -185.90341186523438,
"logps/rejected": -202.92413330078125,
"loss": 0.1461,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3527000844478607,
"rewards/margins": 0.8399491310119629,
"rewards/rejected": -0.48724907636642456,
"step": 49
},
{
"debug/policy_chosen_logits": -3.4781532287597656,
"debug/policy_chosen_logps": -177.86842346191406,
"debug/policy_rejected_logits": -3.0594165325164795,
"debug/policy_rejected_logps": -195.89004516601562,
"debug/reference_chosen_logps": -203.58407592773438,
"debug/reference_rejected_logps": -161.39697265625,
"epoch": 0.9803921568627451,
"grad_norm": 3.3901490209041247,
"learning_rate": 1e-06,
"logits/chosen": -3.4781532287597656,
"logits/rejected": -3.0594165325164795,
"logps/chosen": -177.86842346191406,
"logps/rejected": -195.89004516601562,
"loss": 0.1895,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2571565508842468,
"rewards/margins": 0.6020870208740234,
"rewards/rejected": -0.3449305295944214,
"step": 50
},
{
"debug/policy_chosen_logits": -3.4821436405181885,
"debug/policy_chosen_logps": -177.99000549316406,
"debug/policy_rejected_logits": -3.0359578132629395,
"debug/policy_rejected_logps": -203.94029235839844,
"debug/reference_chosen_logps": -212.01348876953125,
"debug/reference_rejected_logps": -166.06936645507812,
"epoch": 1.0,
"grad_norm": 3.0238129167739807,
"learning_rate": 1e-06,
"logits/chosen": -3.4821436405181885,
"logits/rejected": -3.0359578132629395,
"logps/chosen": -177.99000549316406,
"logps/rejected": -203.94029235839844,
"loss": 0.1383,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3402349352836609,
"rewards/margins": 0.7189440727233887,
"rewards/rejected": -0.37870916724205017,
"step": 51
},
{
"epoch": 1.0,
"step": 51,
"total_flos": 0.0,
"train_loss": 0.26999635322421206,
"train_runtime": 166.8574,
"train_samples_per_second": 19.31,
"train_steps_per_second": 0.306
}
],
"logging_steps": 1,
"max_steps": 51,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}