{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 31, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -3.141004800796509, "debug/policy_chosen_logps": -179.27029418945312, "debug/policy_rejected_logits": -3.201063394546509, "debug/policy_rejected_logps": -197.1785125732422, "debug/reference_chosen_logps": -179.27029418945312, "debug/reference_rejected_logps": -197.1785125732422, "epoch": 0.03225806451612903, "grad_norm": 9.90184688013874, "learning_rate": 1e-06, "logits/chosen": -3.141004800796509, "logits/rejected": -3.201063394546509, "logps/chosen": -179.27029418945312, "logps/rejected": -197.1785125732422, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -3.175147771835327, "debug/policy_chosen_logps": -170.20516967773438, "debug/policy_rejected_logits": -3.1672799587249756, "debug/policy_rejected_logps": -170.16004943847656, "debug/reference_chosen_logps": -172.45477294921875, "debug/reference_rejected_logps": -170.80857849121094, "epoch": 0.06451612903225806, "grad_norm": 12.759093734337377, "learning_rate": 1e-06, "logits/chosen": -3.175147771835327, "logits/rejected": -3.1672799587249756, "logps/chosen": -170.20516967773438, "logps/rejected": -170.16004943847656, "loss": 0.492, "rewards/accuracies": 0.5, "rewards/chosen": 0.02249586209654808, "rewards/margins": 0.016010627150535583, "rewards/rejected": 0.006485233083367348, "step": 2 }, { "debug/policy_chosen_logits": -3.182211399078369, "debug/policy_chosen_logps": -137.9634246826172, "debug/policy_rejected_logits": -3.2010762691497803, "debug/policy_rejected_logps": -186.76589965820312, "debug/reference_chosen_logps": -139.10812377929688, "debug/reference_rejected_logps": -189.68460083007812, "epoch": 0.0967741935483871, "grad_norm": 39.234700405953184, "learning_rate": 1e-06, "logits/chosen": -3.182211399078369, "logits/rejected": -3.2010762691497803, "logps/chosen": -137.9634246826172, "logps/rejected": -186.76589965820312, "loss": 0.5175, "rewards/accuracies": 0.375, "rewards/chosen": 0.011446855962276459, "rewards/margins": -0.017740031704306602, "rewards/rejected": 0.02918688766658306, "step": 3 }, { "debug/policy_chosen_logits": -3.3540351390838623, "debug/policy_chosen_logps": -164.7408447265625, "debug/policy_rejected_logits": -3.372749090194702, "debug/policy_rejected_logps": -192.99838256835938, "debug/reference_chosen_logps": -167.7454833984375, "debug/reference_rejected_logps": -194.29327392578125, "epoch": 0.12903225806451613, "grad_norm": 14.024887806474224, "learning_rate": 1e-06, "logits/chosen": -3.3540351390838623, "logits/rejected": -3.372749090194702, "logps/chosen": -164.7408447265625, "logps/rejected": -192.99838256835938, "loss": 0.5026, "rewards/accuracies": 0.5, "rewards/chosen": 0.03004644252359867, "rewards/margins": 0.017097529023885727, "rewards/rejected": 0.012948913499712944, "step": 4 }, { "debug/policy_chosen_logits": -3.0800650119781494, "debug/policy_chosen_logps": -183.02537536621094, "debug/policy_rejected_logits": -3.0585241317749023, "debug/policy_rejected_logps": -228.36184692382812, "debug/reference_chosen_logps": -183.33753967285156, "debug/reference_rejected_logps": -228.81768798828125, "epoch": 0.16129032258064516, "grad_norm": 32.252102984713154, "learning_rate": 1e-06, "logits/chosen": -3.0800650119781494, "logits/rejected": -3.0585241317749023, "logps/chosen": -183.02537536621094, "logps/rejected": -228.36184692382812, "loss": 0.4924, "rewards/accuracies": 0.25, "rewards/chosen": 0.0031216423958539963, "rewards/margins": -0.0014367960393428802, "rewards/rejected": 0.004558438900858164, "step": 5 }, { "debug/policy_chosen_logits": -3.224591016769409, "debug/policy_chosen_logps": -167.71742248535156, "debug/policy_rejected_logits": -3.289069175720215, "debug/policy_rejected_logps": -221.51846313476562, "debug/reference_chosen_logps": -169.82650756835938, "debug/reference_rejected_logps": -219.76731872558594, "epoch": 0.1935483870967742, "grad_norm": 11.2418551066032, "learning_rate": 1e-06, "logits/chosen": -3.224591016769409, "logits/rejected": -3.289069175720215, "logps/chosen": -167.71742248535156, "logps/rejected": -221.51846313476562, "loss": 0.4992, "rewards/accuracies": 0.875, "rewards/chosen": 0.021090947091579437, "rewards/margins": 0.038602352142333984, "rewards/rejected": -0.017511405050754547, "step": 6 }, { "debug/policy_chosen_logits": -3.296588182449341, "debug/policy_chosen_logps": -149.22991943359375, "debug/policy_rejected_logits": -3.2666971683502197, "debug/policy_rejected_logps": -164.7155303955078, "debug/reference_chosen_logps": -147.17678833007812, "debug/reference_rejected_logps": -163.22897338867188, "epoch": 0.22580645161290322, "grad_norm": 26.842712029286442, "learning_rate": 1e-06, "logits/chosen": -3.296588182449341, "logits/rejected": -3.2666971683502197, "logps/chosen": -149.22991943359375, "logps/rejected": -164.7155303955078, "loss": 0.4918, "rewards/accuracies": 0.875, "rewards/chosen": -0.020531119778752327, "rewards/margins": -0.0056656356900930405, "rewards/rejected": -0.014865485019981861, "step": 7 }, { "debug/policy_chosen_logits": -3.1323249340057373, "debug/policy_chosen_logps": -151.85377502441406, "debug/policy_rejected_logits": -3.1660234928131104, "debug/policy_rejected_logps": -176.71502685546875, "debug/reference_chosen_logps": -153.5457000732422, "debug/reference_rejected_logps": -177.84255981445312, "epoch": 0.25806451612903225, "grad_norm": 24.498644294188434, "learning_rate": 1e-06, "logits/chosen": -3.1323249340057373, "logits/rejected": -3.1660234928131104, "logps/chosen": -151.85377502441406, "logps/rejected": -176.71502685546875, "loss": 0.4997, "rewards/accuracies": 0.75, "rewards/chosen": 0.016919326037168503, "rewards/margins": 0.005644083023071289, "rewards/rejected": 0.011275244876742363, "step": 8 }, { "debug/policy_chosen_logits": -3.202829599380493, "debug/policy_chosen_logps": -156.52719116210938, "debug/policy_rejected_logits": -3.2741198539733887, "debug/policy_rejected_logps": -178.1981201171875, "debug/reference_chosen_logps": -165.70819091796875, "debug/reference_rejected_logps": -176.71347045898438, "epoch": 0.2903225806451613, "grad_norm": 21.38108522550141, "learning_rate": 1e-06, "logits/chosen": -3.202829599380493, "logits/rejected": -3.2741198539733887, "logps/chosen": -156.52719116210938, "logps/rejected": -178.1981201171875, "loss": 0.4911, "rewards/accuracies": 0.75, "rewards/chosen": 0.0918099582195282, "rewards/margins": 0.10665654391050339, "rewards/rejected": -0.014846592210233212, "step": 9 }, { "debug/policy_chosen_logits": -3.072221040725708, "debug/policy_chosen_logps": -158.25836181640625, "debug/policy_rejected_logits": -3.057267427444458, "debug/policy_rejected_logps": -192.45242309570312, "debug/reference_chosen_logps": -164.49575805664062, "debug/reference_rejected_logps": -190.25021362304688, "epoch": 0.3225806451612903, "grad_norm": 18.969223507012597, "learning_rate": 1e-06, "logits/chosen": -3.072221040725708, "logits/rejected": -3.057267427444458, "logps/chosen": -158.25836181640625, "logps/rejected": -192.45242309570312, "loss": 0.5235, "rewards/accuracies": 0.875, "rewards/chosen": 0.06237373501062393, "rewards/margins": 0.08439576625823975, "rewards/rejected": -0.022022036835551262, "step": 10 }, { "debug/policy_chosen_logits": -3.092670440673828, "debug/policy_chosen_logps": -161.42913818359375, "debug/policy_rejected_logits": -3.159600257873535, "debug/policy_rejected_logps": -218.52291870117188, "debug/reference_chosen_logps": -160.97142028808594, "debug/reference_rejected_logps": -214.56564331054688, "epoch": 0.3548387096774194, "grad_norm": 13.28904793895601, "learning_rate": 1e-06, "logits/chosen": -3.092670440673828, "logits/rejected": -3.159600257873535, "logps/chosen": -161.42913818359375, "logps/rejected": -218.52291870117188, "loss": 0.4986, "rewards/accuracies": 0.5, "rewards/chosen": -0.004577284678816795, "rewards/margins": 0.03499564155936241, "rewards/rejected": -0.03957292437553406, "step": 11 }, { "debug/policy_chosen_logits": -3.211040735244751, "debug/policy_chosen_logps": -191.30038452148438, "debug/policy_rejected_logits": -3.1906237602233887, "debug/policy_rejected_logps": -196.66650390625, "debug/reference_chosen_logps": -189.11093139648438, "debug/reference_rejected_logps": -193.2270965576172, "epoch": 0.3870967741935484, "grad_norm": 24.159637455477917, "learning_rate": 1e-06, "logits/chosen": -3.211040735244751, "logits/rejected": -3.1906237602233887, "logps/chosen": -191.30038452148438, "logps/rejected": -196.66650390625, "loss": 0.5118, "rewards/accuracies": 0.5, "rewards/chosen": -0.02189437672495842, "rewards/margins": 0.012499636970460415, "rewards/rejected": -0.03439401462674141, "step": 12 }, { "debug/policy_chosen_logits": -3.3043720722198486, "debug/policy_chosen_logps": -161.9444580078125, "debug/policy_rejected_logits": -3.3002195358276367, "debug/policy_rejected_logps": -193.20816040039062, "debug/reference_chosen_logps": -157.93389892578125, "debug/reference_rejected_logps": -187.40017700195312, "epoch": 0.41935483870967744, "grad_norm": 18.263800856216903, "learning_rate": 1e-06, "logits/chosen": -3.3043720722198486, "logits/rejected": -3.3002195358276367, "logps/chosen": -161.9444580078125, "logps/rejected": -193.20816040039062, "loss": 0.4985, "rewards/accuracies": 0.625, "rewards/chosen": -0.0401054248213768, "rewards/margins": 0.017974376678466797, "rewards/rejected": -0.058079805225133896, "step": 13 }, { "debug/policy_chosen_logits": -3.25443696975708, "debug/policy_chosen_logps": -192.36001586914062, "debug/policy_rejected_logits": -3.1161751747131348, "debug/policy_rejected_logps": -188.52992248535156, "debug/reference_chosen_logps": -186.53701782226562, "debug/reference_rejected_logps": -183.0063934326172, "epoch": 0.45161290322580644, "grad_norm": 17.08256681847481, "learning_rate": 1e-06, "logits/chosen": -3.25443696975708, "logits/rejected": -3.1161751747131348, "logps/chosen": -192.36001586914062, "logps/rejected": -188.52992248535156, "loss": 0.4921, "rewards/accuracies": 0.25, "rewards/chosen": -0.05822989344596863, "rewards/margins": -0.002994718961417675, "rewards/rejected": -0.05523517355322838, "step": 14 }, { "debug/policy_chosen_logits": -3.351620674133301, "debug/policy_chosen_logps": -161.11260986328125, "debug/policy_rejected_logits": -3.3511033058166504, "debug/policy_rejected_logps": -182.30551147460938, "debug/reference_chosen_logps": -156.1150665283203, "debug/reference_rejected_logps": -177.539794921875, "epoch": 0.4838709677419355, "grad_norm": 8.229874549490967, "learning_rate": 1e-06, "logits/chosen": -3.351620674133301, "logits/rejected": -3.3511033058166504, "logps/chosen": -161.11260986328125, "logps/rejected": -182.30551147460938, "loss": 0.4926, "rewards/accuracies": 0.5, "rewards/chosen": -0.04997550696134567, "rewards/margins": -0.00231829471886158, "rewards/rejected": -0.04765721410512924, "step": 15 }, { "debug/policy_chosen_logits": -3.0169596672058105, "debug/policy_chosen_logps": -177.72250366210938, "debug/policy_rejected_logits": -3.0592868328094482, "debug/policy_rejected_logps": -202.39126586914062, "debug/reference_chosen_logps": -175.81646728515625, "debug/reference_rejected_logps": -196.47059631347656, "epoch": 0.5161290322580645, "grad_norm": 18.547449632791665, "learning_rate": 1e-06, "logits/chosen": -3.0169596672058105, "logits/rejected": -3.0592868328094482, "logps/chosen": -177.72250366210938, "logps/rejected": -202.39126586914062, "loss": 0.4991, "rewards/accuracies": 0.875, "rewards/chosen": -0.01906036213040352, "rewards/margins": 0.04014638811349869, "rewards/rejected": -0.059206753969192505, "step": 16 }, { "debug/policy_chosen_logits": -3.0885531902313232, "debug/policy_chosen_logps": -181.26922607421875, "debug/policy_rejected_logits": -3.1282451152801514, "debug/policy_rejected_logps": -192.98558044433594, "debug/reference_chosen_logps": -179.6282958984375, "debug/reference_rejected_logps": -190.09738159179688, "epoch": 0.5483870967741935, "grad_norm": 16.396288041866185, "learning_rate": 1e-06, "logits/chosen": -3.0885531902313232, "logits/rejected": -3.1282451152801514, "logps/chosen": -181.26922607421875, "logps/rejected": -192.98558044433594, "loss": 0.4811, "rewards/accuracies": 0.375, "rewards/chosen": -0.0164091307669878, "rewards/margins": 0.01247288566082716, "rewards/rejected": -0.028882015496492386, "step": 17 }, { "debug/policy_chosen_logits": -3.1581532955169678, "debug/policy_chosen_logps": -159.01693725585938, "debug/policy_rejected_logits": -3.2684059143066406, "debug/policy_rejected_logps": -179.94430541992188, "debug/reference_chosen_logps": -159.53839111328125, "debug/reference_rejected_logps": -175.76220703125, "epoch": 0.5806451612903226, "grad_norm": 15.55451033065194, "learning_rate": 1e-06, "logits/chosen": -3.1581532955169678, "logits/rejected": -3.2684059143066406, "logps/chosen": -159.01693725585938, "logps/rejected": -179.94430541992188, "loss": 0.4987, "rewards/accuracies": 1.0, "rewards/chosen": 0.005214462522417307, "rewards/margins": 0.04703548550605774, "rewards/rejected": -0.04182102158665657, "step": 18 }, { "debug/policy_chosen_logits": -3.1134331226348877, "debug/policy_chosen_logps": -139.32861328125, "debug/policy_rejected_logits": -3.094226837158203, "debug/policy_rejected_logps": -206.40151977539062, "debug/reference_chosen_logps": -141.53060913085938, "debug/reference_rejected_logps": -203.99859619140625, "epoch": 0.6129032258064516, "grad_norm": 11.669033627292357, "learning_rate": 1e-06, "logits/chosen": -3.1134331226348877, "logits/rejected": -3.094226837158203, "logps/chosen": -139.32861328125, "logps/rejected": -206.40151977539062, "loss": 0.4711, "rewards/accuracies": 0.625, "rewards/chosen": 0.02201991155743599, "rewards/margins": 0.046049244701862335, "rewards/rejected": -0.024029331281781197, "step": 19 }, { "debug/policy_chosen_logits": -3.1212384700775146, "debug/policy_chosen_logps": -191.66444396972656, "debug/policy_rejected_logits": -3.260685920715332, "debug/policy_rejected_logps": -187.78424072265625, "debug/reference_chosen_logps": -191.69525146484375, "debug/reference_rejected_logps": -184.98468017578125, "epoch": 0.6451612903225806, "grad_norm": 25.191796111527317, "learning_rate": 1e-06, "logits/chosen": -3.1212384700775146, "logits/rejected": -3.260685920715332, "logps/chosen": -191.66444396972656, "logps/rejected": -187.78424072265625, "loss": 0.4729, "rewards/accuracies": 0.875, "rewards/chosen": 0.000308074988424778, "rewards/margins": 0.028303585946559906, "rewards/rejected": -0.027995511889457703, "step": 20 }, { "debug/policy_chosen_logits": -3.119765520095825, "debug/policy_chosen_logps": -155.2023162841797, "debug/policy_rejected_logits": -3.102405071258545, "debug/policy_rejected_logps": -227.47677612304688, "debug/reference_chosen_logps": -159.71484375, "debug/reference_rejected_logps": -224.30484008789062, "epoch": 0.6774193548387096, "grad_norm": 29.084969849970683, "learning_rate": 1e-06, "logits/chosen": -3.119765520095825, "logits/rejected": -3.102405071258545, "logps/chosen": -155.2023162841797, "logps/rejected": -227.47677612304688, "loss": 0.4949, "rewards/accuracies": 0.875, "rewards/chosen": 0.04512513801455498, "rewards/margins": 0.07684443891048431, "rewards/rejected": -0.031719304621219635, "step": 21 }, { "debug/policy_chosen_logits": -3.1689579486846924, "debug/policy_chosen_logps": -183.11013793945312, "debug/policy_rejected_logits": -3.2236738204956055, "debug/policy_rejected_logps": -208.21568298339844, "debug/reference_chosen_logps": -185.74827575683594, "debug/reference_rejected_logps": -208.32473754882812, "epoch": 0.7096774193548387, "grad_norm": 30.856887681162963, "learning_rate": 1e-06, "logits/chosen": -3.1689579486846924, "logits/rejected": -3.2236738204956055, "logps/chosen": -183.11013793945312, "logps/rejected": -208.21568298339844, "loss": 0.4693, "rewards/accuracies": 0.75, "rewards/chosen": 0.026381436735391617, "rewards/margins": 0.025290966033935547, "rewards/rejected": 0.00109047070145607, "step": 22 }, { "debug/policy_chosen_logits": -3.2187812328338623, "debug/policy_chosen_logps": -149.282470703125, "debug/policy_rejected_logits": -3.192440986633301, "debug/policy_rejected_logps": -173.46737670898438, "debug/reference_chosen_logps": -152.2470245361328, "debug/reference_rejected_logps": -174.3741455078125, "epoch": 0.7419354838709677, "grad_norm": 7.948773470240676, "learning_rate": 1e-06, "logits/chosen": -3.2187812328338623, "logits/rejected": -3.192440986633301, "logps/chosen": -149.282470703125, "logps/rejected": -173.46737670898438, "loss": 0.4702, "rewards/accuracies": 0.5, "rewards/chosen": 0.02964554727077484, "rewards/margins": 0.02057783305644989, "rewards/rejected": 0.0090677160769701, "step": 23 }, { "debug/policy_chosen_logits": -3.232670783996582, "debug/policy_chosen_logps": -183.32423400878906, "debug/policy_rejected_logits": -3.289520025253296, "debug/policy_rejected_logps": -165.54977416992188, "debug/reference_chosen_logps": -180.52044677734375, "debug/reference_rejected_logps": -166.45697021484375, "epoch": 0.7741935483870968, "grad_norm": 19.88569884373009, "learning_rate": 1e-06, "logits/chosen": -3.232670783996582, "logits/rejected": -3.289520025253296, "logps/chosen": -183.32423400878906, "logps/rejected": -165.54977416992188, "loss": 0.4722, "rewards/accuracies": 0.375, "rewards/chosen": -0.028037691488862038, "rewards/margins": -0.037109650671482086, "rewards/rejected": 0.009071960113942623, "step": 24 }, { "debug/policy_chosen_logits": -3.147738456726074, "debug/policy_chosen_logps": -177.48394775390625, "debug/policy_rejected_logits": -3.245333433151245, "debug/policy_rejected_logps": -199.45582580566406, "debug/reference_chosen_logps": -175.49041748046875, "debug/reference_rejected_logps": -194.36691284179688, "epoch": 0.8064516129032258, "grad_norm": 28.982749008082763, "learning_rate": 1e-06, "logits/chosen": -3.147738456726074, "logits/rejected": -3.245333433151245, "logps/chosen": -177.48394775390625, "logps/rejected": -199.45582580566406, "loss": 0.4766, "rewards/accuracies": 0.625, "rewards/chosen": -0.01993529126048088, "rewards/margins": 0.030953893437981606, "rewards/rejected": -0.05088917911052704, "step": 25 }, { "debug/policy_chosen_logits": -3.2787725925445557, "debug/policy_chosen_logps": -133.00405883789062, "debug/policy_rejected_logits": -3.2454986572265625, "debug/policy_rejected_logps": -200.76132202148438, "debug/reference_chosen_logps": -135.29257202148438, "debug/reference_rejected_logps": -194.49354553222656, "epoch": 0.8387096774193549, "grad_norm": 12.049647968845019, "learning_rate": 1e-06, "logits/chosen": -3.2787725925445557, "logits/rejected": -3.2454986572265625, "logps/chosen": -133.00405883789062, "logps/rejected": -200.76132202148438, "loss": 0.4624, "rewards/accuracies": 0.75, "rewards/chosen": 0.022885091602802277, "rewards/margins": 0.08556278049945831, "rewards/rejected": -0.06267768144607544, "step": 26 }, { "debug/policy_chosen_logits": -3.0659728050231934, "debug/policy_chosen_logps": -146.81527709960938, "debug/policy_rejected_logits": -3.2430717945098877, "debug/policy_rejected_logps": -197.0322265625, "debug/reference_chosen_logps": -154.13369750976562, "debug/reference_rejected_logps": -196.10546875, "epoch": 0.8709677419354839, "grad_norm": 32.63477734447832, "learning_rate": 1e-06, "logits/chosen": -3.0659728050231934, "logits/rejected": -3.2430717945098877, "logps/chosen": -146.81527709960938, "logps/rejected": -197.0322265625, "loss": 0.4691, "rewards/accuracies": 0.75, "rewards/chosen": 0.07318423688411713, "rewards/margins": 0.08245191723108292, "rewards/rejected": -0.009267672896385193, "step": 27 }, { "debug/policy_chosen_logits": -3.2434401512145996, "debug/policy_chosen_logps": -149.3644256591797, "debug/policy_rejected_logits": -3.216662883758545, "debug/policy_rejected_logps": -192.85025024414062, "debug/reference_chosen_logps": -149.99478149414062, "debug/reference_rejected_logps": -189.23365783691406, "epoch": 0.9032258064516129, "grad_norm": 10.799436750148159, "learning_rate": 1e-06, "logits/chosen": -3.2434401512145996, "logits/rejected": -3.216662883758545, "logps/chosen": -149.3644256591797, "logps/rejected": -192.85025024414062, "loss": 0.47, "rewards/accuracies": 0.875, "rewards/chosen": 0.006303424946963787, "rewards/margins": 0.04246926307678223, "rewards/rejected": -0.036165837198495865, "step": 28 }, { "debug/policy_chosen_logits": -3.089280128479004, "debug/policy_chosen_logps": -214.14886474609375, "debug/policy_rejected_logits": -3.208700656890869, "debug/policy_rejected_logps": -192.32411193847656, "debug/reference_chosen_logps": -214.442138671875, "debug/reference_rejected_logps": -191.06576538085938, "epoch": 0.9354838709677419, "grad_norm": 17.0594378565657, "learning_rate": 1e-06, "logits/chosen": -3.089280128479004, "logits/rejected": -3.208700656890869, "logps/chosen": -214.14886474609375, "logps/rejected": -192.32411193847656, "loss": 0.4689, "rewards/accuracies": 0.625, "rewards/chosen": 0.0029327012598514557, "rewards/margins": 0.015516050159931183, "rewards/rejected": -0.012583350762724876, "step": 29 }, { "debug/policy_chosen_logits": -3.2566771507263184, "debug/policy_chosen_logps": -147.951904296875, "debug/policy_rejected_logits": -3.314493417739868, "debug/policy_rejected_logps": -165.28897094726562, "debug/reference_chosen_logps": -153.832275390625, "debug/reference_rejected_logps": -167.7118377685547, "epoch": 0.967741935483871, "grad_norm": 10.478877864542989, "learning_rate": 1e-06, "logits/chosen": -3.2566771507263184, "logits/rejected": -3.314493417739868, "logps/chosen": -147.951904296875, "logps/rejected": -165.28897094726562, "loss": 0.4552, "rewards/accuracies": 0.875, "rewards/chosen": 0.05880369991064072, "rewards/margins": 0.034574974328279495, "rewards/rejected": 0.02422872558236122, "step": 30 }, { "debug/policy_chosen_logits": -3.231640338897705, "debug/policy_chosen_logps": -153.23507690429688, "debug/policy_rejected_logits": -3.283898115158081, "debug/policy_rejected_logps": -196.86080932617188, "debug/reference_chosen_logps": -165.0509796142578, "debug/reference_rejected_logps": -195.94383239746094, "epoch": 1.0, "grad_norm": 38.712551042753965, "learning_rate": 1e-06, "logits/chosen": -3.231640338897705, "logits/rejected": -3.283898115158081, "logps/chosen": -153.23507690429688, "logps/rejected": -196.86080932617188, "loss": 0.4584, "rewards/accuracies": 1.0, "rewards/chosen": 0.11815895140171051, "rewards/margins": 0.1273288130760193, "rewards/rejected": -0.009169863536953926, "step": 31 }, { "epoch": 1.0, "step": 31, "total_flos": 0.0, "train_loss": 0.48688735404322225, "train_runtime": 123.759, "train_samples_per_second": 15.894, "train_steps_per_second": 0.25 } ], "logging_steps": 1, "max_steps": 31, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }