{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 42, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -1.7926865816116333, "debug/policy_chosen_logps": -217.38279724121094, "debug/policy_rejected_logits": -1.8328309059143066, "debug/policy_rejected_logps": -225.83067321777344, "debug/reference_chosen_logps": -217.38279724121094, "debug/reference_rejected_logps": -225.83067321777344, "epoch": 0.023809523809523808, "grad_norm": 4.886963528314891, "learning_rate": 1e-06, "logits/chosen": -1.7926865816116333, "logits/rejected": -1.8328309059143066, "logps/chosen": -217.38279724121094, "logps/rejected": -225.83067321777344, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -1.8736802339553833, "debug/policy_chosen_logps": -237.7906036376953, "debug/policy_rejected_logits": -1.6878823041915894, "debug/policy_rejected_logps": -252.66542053222656, "debug/reference_chosen_logps": -237.90771484375, "debug/reference_rejected_logps": -252.53955078125, "epoch": 0.047619047619047616, "grad_norm": 4.425053566972872, "learning_rate": 1e-06, "logits/chosen": -1.8736802339553833, "logits/rejected": -1.6878823041915894, "logps/chosen": -237.7906036376953, "logps/rejected": -252.66542053222656, "loss": 0.4983, "rewards/accuracies": 0.5, "rewards/chosen": 0.001171283656731248, "rewards/margins": 0.0024298285134136677, "rewards/rejected": -0.001258544740267098, "step": 2 }, { "debug/policy_chosen_logits": -1.6082555055618286, "debug/policy_chosen_logps": -239.86978149414062, "debug/policy_rejected_logits": -1.6321099996566772, "debug/policy_rejected_logps": -217.70059204101562, "debug/reference_chosen_logps": -240.5919189453125, "debug/reference_rejected_logps": -218.04476928710938, "epoch": 0.07142857142857142, "grad_norm": 4.882938644343874, "learning_rate": 1e-06, "logits/chosen": -1.6082555055618286, "logits/rejected": -1.6321099996566772, "logps/chosen": -239.86978149414062, "logps/rejected": -217.70059204101562, "loss": 0.4997, "rewards/accuracies": 0.5, "rewards/chosen": 0.007221374660730362, "rewards/margins": 0.0037795070093125105, "rewards/rejected": 0.0034418676514178514, "step": 3 }, { "debug/policy_chosen_logits": -1.8915444612503052, "debug/policy_chosen_logps": -215.7807159423828, "debug/policy_rejected_logits": -1.7285552024841309, "debug/policy_rejected_logps": -219.4071044921875, "debug/reference_chosen_logps": -215.05958557128906, "debug/reference_rejected_logps": -219.14736938476562, "epoch": 0.09523809523809523, "grad_norm": 4.417539068018447, "learning_rate": 1e-06, "logits/chosen": -1.8915444612503052, "logits/rejected": -1.7285552024841309, "logps/chosen": -215.7807159423828, "logps/rejected": -219.4071044921875, "loss": 0.5018, "rewards/accuracies": 0.375, "rewards/chosen": -0.007211265154182911, "rewards/margins": -0.004613952711224556, "rewards/rejected": -0.0025973126757889986, "step": 4 }, { "debug/policy_chosen_logits": -1.8312164545059204, "debug/policy_chosen_logps": -228.4812469482422, "debug/policy_rejected_logits": -1.8293319940567017, "debug/policy_rejected_logps": -219.43865966796875, "debug/reference_chosen_logps": -228.2222137451172, "debug/reference_rejected_logps": -218.95147705078125, "epoch": 0.11904761904761904, "grad_norm": 4.499121781225099, "learning_rate": 1e-06, "logits/chosen": -1.8312164545059204, "logits/rejected": -1.8293319940567017, "logps/chosen": -228.4812469482422, "logps/rejected": -219.43865966796875, "loss": 0.4982, "rewards/accuracies": 0.625, "rewards/chosen": -0.0025903128553181887, "rewards/margins": 0.002281456021592021, "rewards/rejected": -0.00487176887691021, "step": 5 }, { "debug/policy_chosen_logits": -1.7772536277770996, "debug/policy_chosen_logps": -251.26417541503906, "debug/policy_rejected_logits": -1.7712210416793823, "debug/policy_rejected_logps": -224.58258056640625, "debug/reference_chosen_logps": -251.5955047607422, "debug/reference_rejected_logps": -224.013427734375, "epoch": 0.14285714285714285, "grad_norm": 4.746380335819719, "learning_rate": 1e-06, "logits/chosen": -1.7772536277770996, "logits/rejected": -1.7712210416793823, "logps/chosen": -251.26417541503906, "logps/rejected": -224.58258056640625, "loss": 0.5024, "rewards/accuracies": 0.75, "rewards/chosen": 0.0033132743556052446, "rewards/margins": 0.009004650637507439, "rewards/rejected": -0.005691375583410263, "step": 6 }, { "debug/policy_chosen_logits": -1.8587639331817627, "debug/policy_chosen_logps": -215.83526611328125, "debug/policy_rejected_logits": -1.830519437789917, "debug/policy_rejected_logps": -228.4679412841797, "debug/reference_chosen_logps": -215.88973999023438, "debug/reference_rejected_logps": -228.85028076171875, "epoch": 0.16666666666666666, "grad_norm": 4.703836780587687, "learning_rate": 1e-06, "logits/chosen": -1.8587639331817627, "logits/rejected": -1.830519437789917, "logps/chosen": -215.83526611328125, "logps/rejected": -228.4679412841797, "loss": 0.4979, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005447575822472572, "rewards/margins": -0.0032785222865641117, "rewards/rejected": 0.0038232803344726562, "step": 7 }, { "debug/policy_chosen_logits": -1.713200569152832, "debug/policy_chosen_logps": -214.9654083251953, "debug/policy_rejected_logits": -1.6877814531326294, "debug/policy_rejected_logps": -237.10147094726562, "debug/reference_chosen_logps": -215.1925811767578, "debug/reference_rejected_logps": -237.26712036132812, "epoch": 0.19047619047619047, "grad_norm": 4.614150814199241, "learning_rate": 1e-06, "logits/chosen": -1.713200569152832, "logits/rejected": -1.6877814531326294, "logps/chosen": -214.9654083251953, "logps/rejected": -237.10147094726562, "loss": 0.4993, "rewards/accuracies": 0.5, "rewards/chosen": 0.0022717281244695187, "rewards/margins": 0.0006152723217383027, "rewards/rejected": 0.0016564556863158941, "step": 8 }, { "debug/policy_chosen_logits": -1.830588459968567, "debug/policy_chosen_logps": -264.9453430175781, "debug/policy_rejected_logits": -1.7403862476348877, "debug/policy_rejected_logps": -236.23504638671875, "debug/reference_chosen_logps": -265.33489990234375, "debug/reference_rejected_logps": -236.4605712890625, "epoch": 0.21428571428571427, "grad_norm": 4.507439835242966, "learning_rate": 1e-06, "logits/chosen": -1.830588459968567, "logits/rejected": -1.7403862476348877, "logps/chosen": -264.9453430175781, "logps/rejected": -236.23504638671875, "loss": 0.498, "rewards/accuracies": 0.5, "rewards/chosen": 0.003895435482263565, "rewards/margins": 0.0016400336753576994, "rewards/rejected": 0.002255401574075222, "step": 9 }, { "debug/policy_chosen_logits": -1.7635490894317627, "debug/policy_chosen_logps": -212.54888916015625, "debug/policy_rejected_logits": -1.7577309608459473, "debug/policy_rejected_logps": -231.78260803222656, "debug/reference_chosen_logps": -212.55520629882812, "debug/reference_rejected_logps": -231.5526885986328, "epoch": 0.23809523809523808, "grad_norm": 4.646994116106454, "learning_rate": 1e-06, "logits/chosen": -1.7635490894317627, "logits/rejected": -1.7577309608459473, "logps/chosen": -212.54888916015625, "logps/rejected": -231.78260803222656, "loss": 0.4997, "rewards/accuracies": 0.625, "rewards/chosen": 6.330502219498158e-05, "rewards/margins": 0.0023624992463737726, "rewards/rejected": -0.002299194224178791, "step": 10 }, { "debug/policy_chosen_logits": -1.8275775909423828, "debug/policy_chosen_logps": -223.30955505371094, "debug/policy_rejected_logits": -1.7878596782684326, "debug/policy_rejected_logps": -228.84654235839844, "debug/reference_chosen_logps": -223.7688751220703, "debug/reference_rejected_logps": -228.8212890625, "epoch": 0.2619047619047619, "grad_norm": 4.501453979601088, "learning_rate": 1e-06, "logits/chosen": -1.8275775909423828, "logits/rejected": -1.7878596782684326, "logps/chosen": -223.30955505371094, "logps/rejected": -228.84654235839844, "loss": 0.4975, "rewards/accuracies": 0.625, "rewards/chosen": 0.004593219608068466, "rewards/margins": 0.004845847375690937, "rewards/rejected": -0.0002526284661144018, "step": 11 }, { "debug/policy_chosen_logits": -1.8328152894973755, "debug/policy_chosen_logps": -229.270751953125, "debug/policy_rejected_logits": -1.7323297262191772, "debug/policy_rejected_logps": -245.7848358154297, "debug/reference_chosen_logps": -231.10006713867188, "debug/reference_rejected_logps": -244.46746826171875, "epoch": 0.2857142857142857, "grad_norm": 4.605283271613626, "learning_rate": 1e-06, "logits/chosen": -1.8328152894973755, "logits/rejected": -1.7323297262191772, "logps/chosen": -229.270751953125, "logps/rejected": -245.7848358154297, "loss": 0.4964, "rewards/accuracies": 0.75, "rewards/chosen": 0.018293190747499466, "rewards/margins": 0.03146692365407944, "rewards/rejected": -0.013173731975257397, "step": 12 }, { "debug/policy_chosen_logits": -1.7267065048217773, "debug/policy_chosen_logps": -218.44247436523438, "debug/policy_rejected_logits": -1.6951403617858887, "debug/policy_rejected_logps": -240.42892456054688, "debug/reference_chosen_logps": -217.90109252929688, "debug/reference_rejected_logps": -239.02613830566406, "epoch": 0.30952380952380953, "grad_norm": 4.941489045777509, "learning_rate": 1e-06, "logits/chosen": -1.7267065048217773, "logits/rejected": -1.6951403617858887, "logps/chosen": -218.44247436523438, "logps/rejected": -240.42892456054688, "loss": 0.4982, "rewards/accuracies": 0.375, "rewards/chosen": -0.005413799546658993, "rewards/margins": 0.008614081889390945, "rewards/rejected": -0.014027881436049938, "step": 13 }, { "debug/policy_chosen_logits": -1.795722484588623, "debug/policy_chosen_logps": -235.3023681640625, "debug/policy_rejected_logits": -1.7265783548355103, "debug/policy_rejected_logps": -227.51165771484375, "debug/reference_chosen_logps": -233.77352905273438, "debug/reference_rejected_logps": -225.99020385742188, "epoch": 0.3333333333333333, "grad_norm": 5.251200069837007, "learning_rate": 1e-06, "logits/chosen": -1.795722484588623, "logits/rejected": -1.7265783548355103, "logps/chosen": -235.3023681640625, "logps/rejected": -227.51165771484375, "loss": 0.4978, "rewards/accuracies": 0.5, "rewards/chosen": -0.015288218855857849, "rewards/margins": -7.373839616775513e-05, "rewards/rejected": -0.015214480459690094, "step": 14 }, { "debug/policy_chosen_logits": -1.7187248468399048, "debug/policy_chosen_logps": -219.02468872070312, "debug/policy_rejected_logits": -1.710004210472107, "debug/policy_rejected_logps": -239.28665161132812, "debug/reference_chosen_logps": -218.7939453125, "debug/reference_rejected_logps": -236.74258422851562, "epoch": 0.35714285714285715, "grad_norm": 5.232970784816434, "learning_rate": 1e-06, "logits/chosen": -1.7187248468399048, "logits/rejected": -1.710004210472107, "logps/chosen": -219.02468872070312, "logps/rejected": -239.28665161132812, "loss": 0.4967, "rewards/accuracies": 0.875, "rewards/chosen": -0.002307548187673092, "rewards/margins": 0.023132991045713425, "rewards/rejected": -0.025440538302063942, "step": 15 }, { "debug/policy_chosen_logits": -1.7510279417037964, "debug/policy_chosen_logps": -213.79452514648438, "debug/policy_rejected_logits": -1.6364609003067017, "debug/policy_rejected_logps": -230.83267211914062, "debug/reference_chosen_logps": -216.0415802001953, "debug/reference_rejected_logps": -228.79180908203125, "epoch": 0.38095238095238093, "grad_norm": 4.507041423446427, "learning_rate": 1e-06, "logits/chosen": -1.7510279417037964, "logits/rejected": -1.6364609003067017, "logps/chosen": -213.79452514648438, "logps/rejected": -230.83267211914062, "loss": 0.4978, "rewards/accuracies": 0.875, "rewards/chosen": 0.022470567375421524, "rewards/margins": 0.04287933558225632, "rewards/rejected": -0.020408764481544495, "step": 16 }, { "debug/policy_chosen_logits": -1.7066298723220825, "debug/policy_chosen_logps": -234.27340698242188, "debug/policy_rejected_logits": -1.7131836414337158, "debug/policy_rejected_logps": -234.25454711914062, "debug/reference_chosen_logps": -235.27389526367188, "debug/reference_rejected_logps": -234.8538818359375, "epoch": 0.40476190476190477, "grad_norm": 4.798628310057689, "learning_rate": 1e-06, "logits/chosen": -1.7066298723220825, "logits/rejected": -1.7131836414337158, "logps/chosen": -234.27340698242188, "logps/rejected": -234.25454711914062, "loss": 0.5, "rewards/accuracies": 0.5, "rewards/chosen": 0.010004977695643902, "rewards/margins": 0.004011764191091061, "rewards/rejected": 0.005993213504552841, "step": 17 }, { "debug/policy_chosen_logits": -1.7528653144836426, "debug/policy_chosen_logps": -251.7496337890625, "debug/policy_rejected_logits": -1.561893343925476, "debug/policy_rejected_logps": -227.67825317382812, "debug/reference_chosen_logps": -250.92552185058594, "debug/reference_rejected_logps": -227.5825958251953, "epoch": 0.42857142857142855, "grad_norm": 4.585902807542465, "learning_rate": 1e-06, "logits/chosen": -1.7528653144836426, "logits/rejected": -1.561893343925476, "logps/chosen": -251.7496337890625, "logps/rejected": -227.67825317382812, "loss": 0.4927, "rewards/accuracies": 0.5, "rewards/chosen": -0.008241119794547558, "rewards/margins": -0.007284717168658972, "rewards/rejected": -0.0009564019273966551, "step": 18 }, { "debug/policy_chosen_logits": -1.7094969749450684, "debug/policy_chosen_logps": -218.6276397705078, "debug/policy_rejected_logits": -1.6478347778320312, "debug/policy_rejected_logps": -222.29495239257812, "debug/reference_chosen_logps": -218.80328369140625, "debug/reference_rejected_logps": -223.06387329101562, "epoch": 0.4523809523809524, "grad_norm": 4.499072587904488, "learning_rate": 1e-06, "logits/chosen": -1.7094969749450684, "logits/rejected": -1.6478347778320312, "logps/chosen": -218.6276397705078, "logps/rejected": -222.29495239257812, "loss": 0.4999, "rewards/accuracies": 0.5, "rewards/chosen": 0.001756363082677126, "rewards/margins": -0.0059328461065888405, "rewards/rejected": 0.007689208723604679, "step": 19 }, { "debug/policy_chosen_logits": -1.7157065868377686, "debug/policy_chosen_logps": -233.549560546875, "debug/policy_rejected_logits": -1.6739444732666016, "debug/policy_rejected_logps": -228.99595642089844, "debug/reference_chosen_logps": -234.72760009765625, "debug/reference_rejected_logps": -227.83563232421875, "epoch": 0.47619047619047616, "grad_norm": 4.575115483844653, "learning_rate": 1e-06, "logits/chosen": -1.7157065868377686, "logits/rejected": -1.6739444732666016, "logps/chosen": -233.549560546875, "logps/rejected": -228.99595642089844, "loss": 0.4987, "rewards/accuracies": 0.5, "rewards/chosen": 0.011780375614762306, "rewards/margins": 0.023383673280477524, "rewards/rejected": -0.011603298597037792, "step": 20 }, { "debug/policy_chosen_logits": -1.6785526275634766, "debug/policy_chosen_logps": -229.71510314941406, "debug/policy_rejected_logits": -1.5240322351455688, "debug/policy_rejected_logps": -219.5631103515625, "debug/reference_chosen_logps": -229.66180419921875, "debug/reference_rejected_logps": -219.04263305664062, "epoch": 0.5, "grad_norm": 4.692408631629601, "learning_rate": 1e-06, "logits/chosen": -1.6785526275634766, "logits/rejected": -1.5240322351455688, "logps/chosen": -229.71510314941406, "logps/rejected": -219.5631103515625, "loss": 0.504, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005329509731382132, "rewards/margins": 0.004671857692301273, "rewards/rejected": -0.00520480889827013, "step": 21 }, { "debug/policy_chosen_logits": -1.7138959169387817, "debug/policy_chosen_logps": -258.2476806640625, "debug/policy_rejected_logits": -1.6827083826065063, "debug/policy_rejected_logps": -217.47686767578125, "debug/reference_chosen_logps": -258.50408935546875, "debug/reference_rejected_logps": -218.35800170898438, "epoch": 0.5238095238095238, "grad_norm": 5.349249118587083, "learning_rate": 1e-06, "logits/chosen": -1.7138959169387817, "logits/rejected": -1.6827083826065063, "logps/chosen": -258.2476806640625, "logps/rejected": -217.47686767578125, "loss": 0.5027, "rewards/accuracies": 0.375, "rewards/chosen": 0.0025640674866735935, "rewards/margins": -0.00624719588086009, "rewards/rejected": 0.008811263367533684, "step": 22 }, { "debug/policy_chosen_logits": -1.7912077903747559, "debug/policy_chosen_logps": -216.38299560546875, "debug/policy_rejected_logits": -1.8135955333709717, "debug/policy_rejected_logps": -212.30813598632812, "debug/reference_chosen_logps": -217.41909790039062, "debug/reference_rejected_logps": -213.4788055419922, "epoch": 0.5476190476190477, "grad_norm": 5.700609487442719, "learning_rate": 1e-06, "logits/chosen": -1.7912077903747559, "logits/rejected": -1.8135955333709717, "logps/chosen": -216.38299560546875, "logps/rejected": -212.30813598632812, "loss": 0.5012, "rewards/accuracies": 0.375, "rewards/chosen": 0.010360946878790855, "rewards/margins": -0.0013457299210131168, "rewards/rejected": 0.011706676334142685, "step": 23 }, { "debug/policy_chosen_logits": -1.7215830087661743, "debug/policy_chosen_logps": -224.13088989257812, "debug/policy_rejected_logits": -1.7539652585983276, "debug/policy_rejected_logps": -252.66017150878906, "debug/reference_chosen_logps": -223.94528198242188, "debug/reference_rejected_logps": -251.1186065673828, "epoch": 0.5714285714285714, "grad_norm": 4.945978414250695, "learning_rate": 1e-06, "logits/chosen": -1.7215830087661743, "logits/rejected": -1.7539652585983276, "logps/chosen": -224.13088989257812, "logps/rejected": -252.66017150878906, "loss": 0.4913, "rewards/accuracies": 0.625, "rewards/chosen": -0.0018562317127361894, "rewards/margins": 0.013559339568018913, "rewards/rejected": -0.015415573492646217, "step": 24 }, { "debug/policy_chosen_logits": -1.7326081991195679, "debug/policy_chosen_logps": -226.6582794189453, "debug/policy_rejected_logits": -1.701865553855896, "debug/policy_rejected_logps": -225.5404052734375, "debug/reference_chosen_logps": -227.42691040039062, "debug/reference_rejected_logps": -224.72911071777344, "epoch": 0.5952380952380952, "grad_norm": 5.05266825293041, "learning_rate": 1e-06, "logits/chosen": -1.7326081991195679, "logits/rejected": -1.701865553855896, "logps/chosen": -226.6582794189453, "logps/rejected": -225.5404052734375, "loss": 0.4988, "rewards/accuracies": 0.75, "rewards/chosen": 0.007686404511332512, "rewards/margins": 0.01579955965280533, "rewards/rejected": -0.008113155141472816, "step": 25 }, { "debug/policy_chosen_logits": -1.797877550125122, "debug/policy_chosen_logps": -232.49713134765625, "debug/policy_rejected_logits": -1.7819045782089233, "debug/policy_rejected_logps": -239.3779296875, "debug/reference_chosen_logps": -231.29766845703125, "debug/reference_rejected_logps": -239.3263702392578, "epoch": 0.6190476190476191, "grad_norm": 5.6239784950743745, "learning_rate": 1e-06, "logits/chosen": -1.797877550125122, "logits/rejected": -1.7819045782089233, "logps/chosen": -232.49713134765625, "logps/rejected": -239.3779296875, "loss": 0.4983, "rewards/accuracies": 0.375, "rewards/chosen": -0.011994647793471813, "rewards/margins": -0.011478938162326813, "rewards/rejected": -0.0005157091654837132, "step": 26 }, { "debug/policy_chosen_logits": -1.6310824155807495, "debug/policy_chosen_logps": -223.48495483398438, "debug/policy_rejected_logits": -1.5401698350906372, "debug/policy_rejected_logps": -237.32968139648438, "debug/reference_chosen_logps": -225.53250122070312, "debug/reference_rejected_logps": -237.51010131835938, "epoch": 0.6428571428571429, "grad_norm": 5.775503509441982, "learning_rate": 1e-06, "logits/chosen": -1.6310824155807495, "logits/rejected": -1.5401698350906372, "logps/chosen": -223.48495483398438, "logps/rejected": -237.32968139648438, "loss": 0.4974, "rewards/accuracies": 0.625, "rewards/chosen": 0.020475387573242188, "rewards/margins": 0.018671227619051933, "rewards/rejected": 0.0018041613511741161, "step": 27 }, { "debug/policy_chosen_logits": -1.8868080377578735, "debug/policy_chosen_logps": -228.5826416015625, "debug/policy_rejected_logits": -1.640351414680481, "debug/policy_rejected_logps": -232.02215576171875, "debug/reference_chosen_logps": -227.31100463867188, "debug/reference_rejected_logps": -231.01402282714844, "epoch": 0.6666666666666666, "grad_norm": 4.615984568922464, "learning_rate": 1e-06, "logits/chosen": -1.8868080377578735, "logits/rejected": -1.640351414680481, "logps/chosen": -228.5826416015625, "logps/rejected": -232.02215576171875, "loss": 0.5044, "rewards/accuracies": 0.375, "rewards/chosen": -0.012716369703412056, "rewards/margins": -0.0026350021362304688, "rewards/rejected": -0.010081367567181587, "step": 28 }, { "debug/policy_chosen_logits": -1.6878303289413452, "debug/policy_chosen_logps": -207.95928955078125, "debug/policy_rejected_logits": -1.6597003936767578, "debug/policy_rejected_logps": -203.81204223632812, "debug/reference_chosen_logps": -209.3488006591797, "debug/reference_rejected_logps": -204.5792694091797, "epoch": 0.6904761904761905, "grad_norm": 5.141749172246753, "learning_rate": 1e-06, "logits/chosen": -1.6878303289413452, "logits/rejected": -1.6597003936767578, "logps/chosen": -207.95928955078125, "logps/rejected": -203.81204223632812, "loss": 0.5038, "rewards/accuracies": 0.625, "rewards/chosen": 0.013895034790039062, "rewards/margins": 0.006222839932888746, "rewards/rejected": 0.0076721953228116035, "step": 29 }, { "debug/policy_chosen_logits": -1.654941201210022, "debug/policy_chosen_logps": -214.6881866455078, "debug/policy_rejected_logits": -1.7197308540344238, "debug/policy_rejected_logps": -230.4603271484375, "debug/reference_chosen_logps": -213.96749877929688, "debug/reference_rejected_logps": -229.0462188720703, "epoch": 0.7142857142857143, "grad_norm": 4.802743085723465, "learning_rate": 1e-06, "logits/chosen": -1.654941201210022, "logits/rejected": -1.7197308540344238, "logps/chosen": -214.6881866455078, "logps/rejected": -230.4603271484375, "loss": 0.4929, "rewards/accuracies": 0.5, "rewards/chosen": -0.0072069549933075905, "rewards/margins": 0.006934070959687233, "rewards/rejected": -0.014141025952994823, "step": 30 }, { "debug/policy_chosen_logits": -1.5828349590301514, "debug/policy_chosen_logps": -237.3705596923828, "debug/policy_rejected_logits": -1.5794568061828613, "debug/policy_rejected_logps": -254.6802978515625, "debug/reference_chosen_logps": -235.51742553710938, "debug/reference_rejected_logps": -251.95379638671875, "epoch": 0.7380952380952381, "grad_norm": 5.710393511185945, "learning_rate": 1e-06, "logits/chosen": -1.5828349590301514, "logits/rejected": -1.5794568061828613, "logps/chosen": -237.3705596923828, "logps/rejected": -254.6802978515625, "loss": 0.4965, "rewards/accuracies": 0.5, "rewards/chosen": -0.0185314379632473, "rewards/margins": 0.00873336661607027, "rewards/rejected": -0.027264803647994995, "step": 31 }, { "debug/policy_chosen_logits": -1.8172823190689087, "debug/policy_chosen_logps": -233.02899169921875, "debug/policy_rejected_logits": -1.8409276008605957, "debug/policy_rejected_logps": -226.48519897460938, "debug/reference_chosen_logps": -232.89686584472656, "debug/reference_rejected_logps": -226.06564331054688, "epoch": 0.7619047619047619, "grad_norm": 4.735509214480415, "learning_rate": 1e-06, "logits/chosen": -1.8172823190689087, "logits/rejected": -1.8409276008605957, "logps/chosen": -233.02899169921875, "logps/rejected": -226.48519897460938, "loss": 0.5039, "rewards/accuracies": 0.75, "rewards/chosen": -0.0013212203048169613, "rewards/margins": 0.0028741275891661644, "rewards/rejected": -0.0041953460313379765, "step": 32 }, { "debug/policy_chosen_logits": -1.7416476011276245, "debug/policy_chosen_logps": -210.79525756835938, "debug/policy_rejected_logits": -1.65560781955719, "debug/policy_rejected_logps": -239.12855529785156, "debug/reference_chosen_logps": -209.15957641601562, "debug/reference_rejected_logps": -239.18234252929688, "epoch": 0.7857142857142857, "grad_norm": 4.895068163607816, "learning_rate": 1e-06, "logits/chosen": -1.7416476011276245, "logits/rejected": -1.65560781955719, "logps/chosen": -210.79525756835938, "logps/rejected": -239.12855529785156, "loss": 0.5056, "rewards/accuracies": 0.375, "rewards/chosen": -0.016356829553842545, "rewards/margins": -0.016894912347197533, "rewards/rejected": 0.0005380823276937008, "step": 33 }, { "debug/policy_chosen_logits": -1.7726426124572754, "debug/policy_chosen_logps": -225.79507446289062, "debug/policy_rejected_logits": -1.7965084314346313, "debug/policy_rejected_logps": -220.8616943359375, "debug/reference_chosen_logps": -225.74952697753906, "debug/reference_rejected_logps": -221.5814666748047, "epoch": 0.8095238095238095, "grad_norm": 5.792452647884142, "learning_rate": 1e-06, "logits/chosen": -1.7726426124572754, "logits/rejected": -1.7965084314346313, "logps/chosen": -225.79507446289062, "logps/rejected": -220.8616943359375, "loss": 0.4985, "rewards/accuracies": 0.375, "rewards/chosen": -0.00045545492321252823, "rewards/margins": -0.007653160020709038, "rewards/rejected": 0.007197704166173935, "step": 34 }, { "debug/policy_chosen_logits": -1.6315754652023315, "debug/policy_chosen_logps": -234.38040161132812, "debug/policy_rejected_logits": -1.613837718963623, "debug/policy_rejected_logps": -246.2030029296875, "debug/reference_chosen_logps": -236.5584716796875, "debug/reference_rejected_logps": -244.6172332763672, "epoch": 0.8333333333333334, "grad_norm": 5.023497019038571, "learning_rate": 1e-06, "logits/chosen": -1.6315754652023315, "logits/rejected": -1.613837718963623, "logps/chosen": -234.38040161132812, "logps/rejected": -246.2030029296875, "loss": 0.4951, "rewards/accuracies": 0.75, "rewards/chosen": 0.021780777722597122, "rewards/margins": 0.0376385897397995, "rewards/rejected": -0.015857810154557228, "step": 35 }, { "debug/policy_chosen_logits": -1.733331561088562, "debug/policy_chosen_logps": -232.724609375, "debug/policy_rejected_logits": -1.7223798036575317, "debug/policy_rejected_logps": -222.6837158203125, "debug/reference_chosen_logps": -233.48643493652344, "debug/reference_rejected_logps": -222.51834106445312, "epoch": 0.8571428571428571, "grad_norm": 4.953158777901106, "learning_rate": 1e-06, "logits/chosen": -1.733331561088562, "logits/rejected": -1.7223798036575317, "logps/chosen": -232.724609375, "logps/rejected": -222.6837158203125, "loss": 0.4898, "rewards/accuracies": 0.625, "rewards/chosen": 0.00761817954480648, "rewards/margins": 0.009272022172808647, "rewards/rejected": -0.0016538426280021667, "step": 36 }, { "debug/policy_chosen_logits": -1.859521508216858, "debug/policy_chosen_logps": -225.0269775390625, "debug/policy_rejected_logits": -1.8794121742248535, "debug/policy_rejected_logps": -238.612548828125, "debug/reference_chosen_logps": -226.9108428955078, "debug/reference_rejected_logps": -235.47463989257812, "epoch": 0.8809523809523809, "grad_norm": 4.654227939974065, "learning_rate": 1e-06, "logits/chosen": -1.859521508216858, "logits/rejected": -1.8794121742248535, "logps/chosen": -225.0269775390625, "logps/rejected": -238.612548828125, "loss": 0.4792, "rewards/accuracies": 0.5, "rewards/chosen": 0.01883869245648384, "rewards/margins": 0.05021755024790764, "rewards/rejected": -0.031378861516714096, "step": 37 }, { "debug/policy_chosen_logits": -1.8789831399917603, "debug/policy_chosen_logps": -208.9774169921875, "debug/policy_rejected_logits": -1.8391132354736328, "debug/policy_rejected_logps": -209.525390625, "debug/reference_chosen_logps": -207.11997985839844, "debug/reference_rejected_logps": -208.88546752929688, "epoch": 0.9047619047619048, "grad_norm": 5.750158702312609, "learning_rate": 1e-06, "logits/chosen": -1.8789831399917603, "logits/rejected": -1.8391132354736328, "logps/chosen": -208.9774169921875, "logps/rejected": -209.525390625, "loss": 0.4916, "rewards/accuracies": 0.25, "rewards/chosen": -0.018574314191937447, "rewards/margins": -0.012175025418400764, "rewards/rejected": -0.0063992878422141075, "step": 38 }, { "debug/policy_chosen_logits": -1.8669378757476807, "debug/policy_chosen_logps": -256.5040283203125, "debug/policy_rejected_logits": -1.771195411682129, "debug/policy_rejected_logps": -240.91348266601562, "debug/reference_chosen_logps": -256.64874267578125, "debug/reference_rejected_logps": -240.50926208496094, "epoch": 0.9285714285714286, "grad_norm": 5.01330786712291, "learning_rate": 1e-06, "logits/chosen": -1.8669378757476807, "logits/rejected": -1.771195411682129, "logps/chosen": -256.5040283203125, "logps/rejected": -240.91348266601562, "loss": 0.4963, "rewards/accuracies": 0.5, "rewards/chosen": 0.0014472389593720436, "rewards/margins": 0.005489482544362545, "rewards/rejected": -0.004042243584990501, "step": 39 }, { "debug/policy_chosen_logits": -1.8057416677474976, "debug/policy_chosen_logps": -199.05746459960938, "debug/policy_rejected_logits": -1.72264564037323, "debug/policy_rejected_logps": -230.5078125, "debug/reference_chosen_logps": -198.6903533935547, "debug/reference_rejected_logps": -230.05313110351562, "epoch": 0.9523809523809523, "grad_norm": 4.6267949338716425, "learning_rate": 1e-06, "logits/chosen": -1.8057416677474976, "logits/rejected": -1.72264564037323, "logps/chosen": -199.05746459960938, "logps/rejected": -230.5078125, "loss": 0.5011, "rewards/accuracies": 0.375, "rewards/chosen": -0.0036712647415697575, "rewards/margins": 0.0008756828028708696, "rewards/rejected": -0.004546947777271271, "step": 40 }, { "debug/policy_chosen_logits": -1.7298365831375122, "debug/policy_chosen_logps": -215.87039184570312, "debug/policy_rejected_logits": -1.7618889808654785, "debug/policy_rejected_logps": -218.1046600341797, "debug/reference_chosen_logps": -216.2220001220703, "debug/reference_rejected_logps": -218.1280059814453, "epoch": 0.9761904761904762, "grad_norm": 5.043714508829569, "learning_rate": 1e-06, "logits/chosen": -1.7298365831375122, "logits/rejected": -1.7618889808654785, "logps/chosen": -215.87039184570312, "logps/rejected": -218.1046600341797, "loss": 0.4946, "rewards/accuracies": 0.625, "rewards/chosen": 0.003516044234856963, "rewards/margins": 0.003282546764239669, "rewards/rejected": 0.00023349723778665066, "step": 41 }, { "debug/policy_chosen_logits": -1.9528135061264038, "debug/policy_chosen_logps": -210.9334259033203, "debug/policy_rejected_logits": -1.8600839376449585, "debug/policy_rejected_logps": -225.0123291015625, "debug/reference_chosen_logps": -212.72988891601562, "debug/reference_rejected_logps": -225.8553924560547, "epoch": 1.0, "grad_norm": 5.608640172933804, "learning_rate": 1e-06, "logits/chosen": -1.9528135061264038, "logits/rejected": -1.8600839376449585, "logps/chosen": -210.9334259033203, "logps/rejected": -225.0123291015625, "loss": 0.468, "rewards/accuracies": 0.75, "rewards/chosen": 0.017964612692594528, "rewards/margins": 0.009534033946692944, "rewards/rejected": 0.00843057781457901, "step": 42 }, { "epoch": 1.0, "step": 42, "total_flos": 0.0, "train_loss": 0.49727832632405417, "train_runtime": 148.4721, "train_samples_per_second": 17.822, "train_steps_per_second": 0.283 } ], "logging_steps": 1, "max_steps": 42, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }