{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 64, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -0.9758062958717346, "debug/policy_chosen_logps": -151.53146362304688, "debug/policy_rejected_logits": -0.9790539741516113, "debug/policy_rejected_logps": -147.30738830566406, "debug/reference_chosen_logps": -151.53146362304688, "debug/reference_rejected_logps": -147.30738830566406, "epoch": 0.015625, "grad_norm": 7.077985294284826, "learning_rate": 1e-06, "logits/chosen": -0.9758062958717346, "logits/rejected": -0.9790539741516113, "logps/chosen": -151.53146362304688, "logps/rejected": -147.30738830566406, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -0.9311444759368896, "debug/policy_chosen_logps": -157.56246948242188, "debug/policy_rejected_logits": -0.6852308511734009, "debug/policy_rejected_logps": -181.45333862304688, "debug/reference_chosen_logps": -157.81744384765625, "debug/reference_rejected_logps": -181.56724548339844, "epoch": 0.03125, "grad_norm": 5.504247352373829, "learning_rate": 1e-06, "logits/chosen": -0.9311444759368896, "logits/rejected": -0.6852308511734009, "logps/chosen": -157.56246948242188, "logps/rejected": -181.45333862304688, "loss": 0.4999, "rewards/accuracies": 0.75, "rewards/chosen": 0.002549667377024889, "rewards/margins": 0.0014108085306361318, "rewards/rejected": 0.0011388587299734354, "step": 2 }, { "debug/policy_chosen_logits": -0.8895314335823059, "debug/policy_chosen_logps": -195.25637817382812, "debug/policy_rejected_logits": -0.575265109539032, "debug/policy_rejected_logps": -194.01516723632812, "debug/reference_chosen_logps": -195.29818725585938, "debug/reference_rejected_logps": -194.30865478515625, "epoch": 0.046875, "grad_norm": 5.929427478772651, "learning_rate": 1e-06, "logits/chosen": -0.8895314335823059, "logits/rejected": -0.575265109539032, "logps/chosen": -195.25637817382812, "logps/rejected": -194.01516723632812, "loss": 0.5, "rewards/accuracies": 0.5, "rewards/chosen": 0.00041794776916503906, "rewards/margins": -0.00251706107519567, "rewards/rejected": 0.002935009077191353, "step": 3 }, { "debug/policy_chosen_logits": -1.1189931631088257, "debug/policy_chosen_logps": -143.90708923339844, "debug/policy_rejected_logits": -0.9135940670967102, "debug/policy_rejected_logps": -157.21446228027344, "debug/reference_chosen_logps": -144.0689697265625, "debug/reference_rejected_logps": -156.4416046142578, "epoch": 0.0625, "grad_norm": 6.2120667679725585, "learning_rate": 1e-06, "logits/chosen": -1.1189931631088257, "logits/rejected": -0.9135940670967102, "logps/chosen": -143.90708923339844, "logps/rejected": -157.21446228027344, "loss": 0.4959, "rewards/accuracies": 0.875, "rewards/chosen": 0.0016189860180020332, "rewards/margins": 0.009347562678158283, "rewards/rejected": -0.00772857666015625, "step": 4 }, { "debug/policy_chosen_logits": -0.8267368674278259, "debug/policy_chosen_logps": -183.93475341796875, "debug/policy_rejected_logits": -0.761789858341217, "debug/policy_rejected_logps": -177.76959228515625, "debug/reference_chosen_logps": -183.55718994140625, "debug/reference_rejected_logps": -176.53091430664062, "epoch": 0.078125, "grad_norm": 6.227908126221475, "learning_rate": 1e-06, "logits/chosen": -0.8267368674278259, "logits/rejected": -0.761789858341217, "logps/chosen": -183.93475341796875, "logps/rejected": -177.76959228515625, "loss": 0.4983, "rewards/accuracies": 0.625, "rewards/chosen": -0.0037756823003292084, "rewards/margins": 0.008610926568508148, "rewards/rejected": -0.012386607937514782, "step": 5 }, { "debug/policy_chosen_logits": -1.1983323097229004, "debug/policy_chosen_logps": -134.5792236328125, "debug/policy_rejected_logits": -1.1904683113098145, "debug/policy_rejected_logps": -152.11712646484375, "debug/reference_chosen_logps": -135.0609130859375, "debug/reference_rejected_logps": -151.68475341796875, "epoch": 0.09375, "grad_norm": 6.006130436499243, "learning_rate": 1e-06, "logits/chosen": -1.1983323097229004, "logits/rejected": -1.1904683113098145, "logps/chosen": -134.5792236328125, "logps/rejected": -152.11712646484375, "loss": 0.4928, "rewards/accuracies": 0.5, "rewards/chosen": 0.004816893953830004, "rewards/margins": 0.009140652604401112, "rewards/rejected": -0.004323759116232395, "step": 6 }, { "debug/policy_chosen_logits": -1.0346052646636963, "debug/policy_chosen_logps": -134.0786895751953, "debug/policy_rejected_logits": -0.8808152079582214, "debug/policy_rejected_logps": -155.0458984375, "debug/reference_chosen_logps": -135.0550537109375, "debug/reference_rejected_logps": -154.17298889160156, "epoch": 0.109375, "grad_norm": 5.440760637907165, "learning_rate": 1e-06, "logits/chosen": -1.0346052646636963, "logits/rejected": -0.8808152079582214, "logps/chosen": -134.0786895751953, "logps/rejected": -155.0458984375, "loss": 0.4961, "rewards/accuracies": 0.5, "rewards/chosen": 0.009763631038367748, "rewards/margins": 0.018492689356207848, "rewards/rejected": -0.008729057386517525, "step": 7 }, { "debug/policy_chosen_logits": -0.9342319369316101, "debug/policy_chosen_logps": -137.67352294921875, "debug/policy_rejected_logits": -0.9422162771224976, "debug/policy_rejected_logps": -163.11819458007812, "debug/reference_chosen_logps": -138.39669799804688, "debug/reference_rejected_logps": -162.8925018310547, "epoch": 0.125, "grad_norm": 5.917503926417565, "learning_rate": 1e-06, "logits/chosen": -0.9342319369316101, "logits/rejected": -0.9422162771224976, "logps/chosen": -137.67352294921875, "logps/rejected": -163.11819458007812, "loss": 0.4951, "rewards/accuracies": 0.75, "rewards/chosen": 0.007231750525534153, "rewards/margins": 0.009488582611083984, "rewards/rejected": -0.002256832318380475, "step": 8 }, { "debug/policy_chosen_logits": -0.7332696914672852, "debug/policy_chosen_logps": -188.66183471679688, "debug/policy_rejected_logits": -0.5404833555221558, "debug/policy_rejected_logps": -211.7197723388672, "debug/reference_chosen_logps": -187.5482635498047, "debug/reference_rejected_logps": -209.54415893554688, "epoch": 0.140625, "grad_norm": 5.9550182446271185, "learning_rate": 1e-06, "logits/chosen": -0.7332696914672852, "logits/rejected": -0.5404833555221558, "logps/chosen": -188.66183471679688, "logps/rejected": -211.7197723388672, "loss": 0.4949, "rewards/accuracies": 0.625, "rewards/chosen": -0.011135692708194256, "rewards/margins": 0.010620327666401863, "rewards/rejected": -0.021756019443273544, "step": 9 }, { "debug/policy_chosen_logits": -0.9110831022262573, "debug/policy_chosen_logps": -148.4445343017578, "debug/policy_rejected_logits": -0.8464857339859009, "debug/policy_rejected_logps": -157.1292724609375, "debug/reference_chosen_logps": -150.76031494140625, "debug/reference_rejected_logps": -156.712890625, "epoch": 0.15625, "grad_norm": 5.99386820607358, "learning_rate": 1e-06, "logits/chosen": -0.9110831022262573, "logits/rejected": -0.8464857339859009, "logps/chosen": -148.4445343017578, "logps/rejected": -157.1292724609375, "loss": 0.492, "rewards/accuracies": 0.625, "rewards/chosen": 0.023157909512519836, "rewards/margins": 0.027321862056851387, "rewards/rejected": -0.004163951613008976, "step": 10 }, { "debug/policy_chosen_logits": -0.7407766580581665, "debug/policy_chosen_logps": -179.63021850585938, "debug/policy_rejected_logits": -0.8468393683433533, "debug/policy_rejected_logps": -164.4591522216797, "debug/reference_chosen_logps": -178.74200439453125, "debug/reference_rejected_logps": -164.0178680419922, "epoch": 0.171875, "grad_norm": 6.56332815982213, "learning_rate": 1e-06, "logits/chosen": -0.7407766580581665, "logits/rejected": -0.8468393683433533, "logps/chosen": -179.63021850585938, "logps/rejected": -164.4591522216797, "loss": 0.4958, "rewards/accuracies": 0.375, "rewards/chosen": -0.008881940506398678, "rewards/margins": -0.0044690510258078575, "rewards/rejected": -0.004412889014929533, "step": 11 }, { "debug/policy_chosen_logits": -0.8543327450752258, "debug/policy_chosen_logps": -173.95263671875, "debug/policy_rejected_logits": -0.7469156980514526, "debug/policy_rejected_logps": -187.60519409179688, "debug/reference_chosen_logps": -174.4696502685547, "debug/reference_rejected_logps": -184.88877868652344, "epoch": 0.1875, "grad_norm": 5.8252129211333825, "learning_rate": 1e-06, "logits/chosen": -0.8543327450752258, "logits/rejected": -0.7469156980514526, "logps/chosen": -173.95263671875, "logps/rejected": -187.60519409179688, "loss": 0.497, "rewards/accuracies": 1.0, "rewards/chosen": 0.005170030519366264, "rewards/margins": 0.03233422338962555, "rewards/rejected": -0.027164191007614136, "step": 12 }, { "debug/policy_chosen_logits": -0.8461030125617981, "debug/policy_chosen_logps": -143.32421875, "debug/policy_rejected_logits": -0.8543170690536499, "debug/policy_rejected_logps": -186.77857971191406, "debug/reference_chosen_logps": -146.84774780273438, "debug/reference_rejected_logps": -186.40374755859375, "epoch": 0.203125, "grad_norm": 5.76637069916284, "learning_rate": 1e-06, "logits/chosen": -0.8461030125617981, "logits/rejected": -0.8543170690536499, "logps/chosen": -143.32421875, "logps/rejected": -186.77857971191406, "loss": 0.4832, "rewards/accuracies": 0.75, "rewards/chosen": 0.03523515537381172, "rewards/margins": 0.03898348659276962, "rewards/rejected": -0.0037483316846191883, "step": 13 }, { "debug/policy_chosen_logits": -0.9689863920211792, "debug/policy_chosen_logps": -154.18991088867188, "debug/policy_rejected_logits": -0.6996111869812012, "debug/policy_rejected_logps": -168.9903564453125, "debug/reference_chosen_logps": -157.5633544921875, "debug/reference_rejected_logps": -168.59417724609375, "epoch": 0.21875, "grad_norm": 5.591725227630854, "learning_rate": 1e-06, "logits/chosen": -0.9689863920211792, "logits/rejected": -0.6996111869812012, "logps/chosen": -154.18991088867188, "logps/rejected": -168.9903564453125, "loss": 0.4893, "rewards/accuracies": 1.0, "rewards/chosen": 0.0337345227599144, "rewards/margins": 0.03769625723361969, "rewards/rejected": -0.003961734939366579, "step": 14 }, { "debug/policy_chosen_logits": -0.9054354429244995, "debug/policy_chosen_logps": -145.04635620117188, "debug/policy_rejected_logits": -0.8977835774421692, "debug/policy_rejected_logps": -163.13455200195312, "debug/reference_chosen_logps": -147.23922729492188, "debug/reference_rejected_logps": -162.36553955078125, "epoch": 0.234375, "grad_norm": 6.04155713949948, "learning_rate": 1e-06, "logits/chosen": -0.9054354429244995, "logits/rejected": -0.8977835774421692, "logps/chosen": -145.04635620117188, "logps/rejected": -163.13455200195312, "loss": 0.4855, "rewards/accuracies": 0.625, "rewards/chosen": 0.0219286996871233, "rewards/margins": 0.02961883321404457, "rewards/rejected": -0.00769013399258256, "step": 15 }, { "debug/policy_chosen_logits": -0.8554494380950928, "debug/policy_chosen_logps": -161.84022521972656, "debug/policy_rejected_logits": -0.8830718994140625, "debug/policy_rejected_logps": -173.75918579101562, "debug/reference_chosen_logps": -164.03298950195312, "debug/reference_rejected_logps": -174.45535278320312, "epoch": 0.25, "grad_norm": 5.970121851694702, "learning_rate": 1e-06, "logits/chosen": -0.8554494380950928, "logits/rejected": -0.8830718994140625, "logps/chosen": -161.84022521972656, "logps/rejected": -173.75918579101562, "loss": 0.4842, "rewards/accuracies": 0.75, "rewards/chosen": 0.021927593275904655, "rewards/margins": 0.014965922571718693, "rewards/rejected": 0.006961670238524675, "step": 16 }, { "debug/policy_chosen_logits": -0.9481765627861023, "debug/policy_chosen_logps": -162.32388305664062, "debug/policy_rejected_logits": -0.7944669127464294, "debug/policy_rejected_logps": -188.93453979492188, "debug/reference_chosen_logps": -163.7265625, "debug/reference_rejected_logps": -185.91177368164062, "epoch": 0.265625, "grad_norm": 5.71178003145209, "learning_rate": 1e-06, "logits/chosen": -0.9481765627861023, "logits/rejected": -0.7944669127464294, "logps/chosen": -162.32388305664062, "logps/rejected": -188.93453979492188, "loss": 0.4899, "rewards/accuracies": 1.0, "rewards/chosen": 0.014026792719960213, "rewards/margins": 0.044254451990127563, "rewards/rejected": -0.0302276611328125, "step": 17 }, { "debug/policy_chosen_logits": -0.6716693043708801, "debug/policy_chosen_logps": -187.95364379882812, "debug/policy_rejected_logits": -0.8150543570518494, "debug/policy_rejected_logps": -169.115478515625, "debug/reference_chosen_logps": -187.42266845703125, "debug/reference_rejected_logps": -166.06893920898438, "epoch": 0.28125, "grad_norm": 6.343140969572851, "learning_rate": 1e-06, "logits/chosen": -0.6716693043708801, "logits/rejected": -0.8150543570518494, "logps/chosen": -187.95364379882812, "logps/rejected": -169.115478515625, "loss": 0.4942, "rewards/accuracies": 0.75, "rewards/chosen": -0.005309591069817543, "rewards/margins": 0.025155849754810333, "rewards/rejected": -0.030465438961982727, "step": 18 }, { "debug/policy_chosen_logits": -0.9492964744567871, "debug/policy_chosen_logps": -158.28807067871094, "debug/policy_rejected_logits": -0.8160263895988464, "debug/policy_rejected_logps": -167.84140014648438, "debug/reference_chosen_logps": -159.18081665039062, "debug/reference_rejected_logps": -166.4847869873047, "epoch": 0.296875, "grad_norm": 5.7981298310331795, "learning_rate": 1e-06, "logits/chosen": -0.9492964744567871, "logits/rejected": -0.8160263895988464, "logps/chosen": -158.28807067871094, "logps/rejected": -167.84140014648438, "loss": 0.4863, "rewards/accuracies": 0.75, "rewards/chosen": 0.008927486836910248, "rewards/margins": 0.022493677213788033, "rewards/rejected": -0.013566188514232635, "step": 19 }, { "debug/policy_chosen_logits": -0.9160640239715576, "debug/policy_chosen_logps": -165.8433837890625, "debug/policy_rejected_logits": -0.8921107053756714, "debug/policy_rejected_logps": -172.75804138183594, "debug/reference_chosen_logps": -164.21383666992188, "debug/reference_rejected_logps": -168.41940307617188, "epoch": 0.3125, "grad_norm": 6.672971626135043, "learning_rate": 1e-06, "logits/chosen": -0.9160640239715576, "logits/rejected": -0.8921107053756714, "logps/chosen": -165.8433837890625, "logps/rejected": -172.75804138183594, "loss": 0.4778, "rewards/accuracies": 0.75, "rewards/chosen": -0.016295330598950386, "rewards/margins": 0.027090895920991898, "rewards/rejected": -0.04338622838258743, "step": 20 }, { "debug/policy_chosen_logits": -0.6745712161064148, "debug/policy_chosen_logps": -169.40414428710938, "debug/policy_rejected_logits": -0.8499138355255127, "debug/policy_rejected_logps": -174.95440673828125, "debug/reference_chosen_logps": -169.4923095703125, "debug/reference_rejected_logps": -175.6483154296875, "epoch": 0.328125, "grad_norm": 5.673305428723763, "learning_rate": 1e-06, "logits/chosen": -0.6745712161064148, "logits/rejected": -0.8499138355255127, "logps/chosen": -169.40414428710938, "logps/rejected": -174.95440673828125, "loss": 0.4998, "rewards/accuracies": 0.375, "rewards/chosen": 0.0008817007765173912, "rewards/margins": -0.006057358346879482, "rewards/rejected": 0.0069390591233968735, "step": 21 }, { "debug/policy_chosen_logits": -0.7490401268005371, "debug/policy_chosen_logps": -180.66146850585938, "debug/policy_rejected_logits": -0.6564124226570129, "debug/policy_rejected_logps": -195.39569091796875, "debug/reference_chosen_logps": -184.63441467285156, "debug/reference_rejected_logps": -196.98995971679688, "epoch": 0.34375, "grad_norm": 6.449944674945484, "learning_rate": 1e-06, "logits/chosen": -0.7490401268005371, "logits/rejected": -0.6564124226570129, "logps/chosen": -180.66146850585938, "logps/rejected": -195.39569091796875, "loss": 0.4655, "rewards/accuracies": 0.625, "rewards/chosen": 0.03972943127155304, "rewards/margins": 0.02378678321838379, "rewards/rejected": 0.0159426499158144, "step": 22 }, { "debug/policy_chosen_logits": -0.6730453372001648, "debug/policy_chosen_logps": -178.2505340576172, "debug/policy_rejected_logits": -0.6955782175064087, "debug/policy_rejected_logps": -174.2504119873047, "debug/reference_chosen_logps": -182.0872802734375, "debug/reference_rejected_logps": -175.9362030029297, "epoch": 0.359375, "grad_norm": 6.173388994966317, "learning_rate": 1e-06, "logits/chosen": -0.6730453372001648, "logits/rejected": -0.6955782175064087, "logps/chosen": -178.2505340576172, "logps/rejected": -174.2504119873047, "loss": 0.4747, "rewards/accuracies": 0.5, "rewards/chosen": 0.038367509841918945, "rewards/margins": 0.021509580314159393, "rewards/rejected": 0.016857929527759552, "step": 23 }, { "debug/policy_chosen_logits": -0.8440623879432678, "debug/policy_chosen_logps": -158.67828369140625, "debug/policy_rejected_logits": -0.8467646837234497, "debug/policy_rejected_logps": -173.41604614257812, "debug/reference_chosen_logps": -160.15481567382812, "debug/reference_rejected_logps": -172.91629028320312, "epoch": 0.375, "grad_norm": 6.519435131661764, "learning_rate": 1e-06, "logits/chosen": -0.8440623879432678, "logits/rejected": -0.8467646837234497, "logps/chosen": -158.67828369140625, "logps/rejected": -173.41604614257812, "loss": 0.4892, "rewards/accuracies": 0.75, "rewards/chosen": 0.0147654814645648, "rewards/margins": 0.01976308785378933, "rewards/rejected": -0.004997607786208391, "step": 24 }, { "debug/policy_chosen_logits": -0.6752190589904785, "debug/policy_chosen_logps": -187.1298065185547, "debug/policy_rejected_logits": -0.7732603549957275, "debug/policy_rejected_logps": -158.61585998535156, "debug/reference_chosen_logps": -190.5185089111328, "debug/reference_rejected_logps": -159.48568725585938, "epoch": 0.390625, "grad_norm": 5.353977426256108, "learning_rate": 1e-06, "logits/chosen": -0.6752190589904785, "logits/rejected": -0.7732603549957275, "logps/chosen": -187.1298065185547, "logps/rejected": -158.61585998535156, "loss": 0.4656, "rewards/accuracies": 0.75, "rewards/chosen": 0.03388698399066925, "rewards/margins": 0.025188742205500603, "rewards/rejected": 0.008698243647813797, "step": 25 }, { "debug/policy_chosen_logits": -0.932550311088562, "debug/policy_chosen_logps": -157.26089477539062, "debug/policy_rejected_logits": -0.8474171161651611, "debug/policy_rejected_logps": -163.1019744873047, "debug/reference_chosen_logps": -154.3992919921875, "debug/reference_rejected_logps": -162.41476440429688, "epoch": 0.40625, "grad_norm": 6.6664666902082494, "learning_rate": 1e-06, "logits/chosen": -0.932550311088562, "logits/rejected": -0.8474171161651611, "logps/chosen": -157.26089477539062, "logps/rejected": -163.1019744873047, "loss": 0.4883, "rewards/accuracies": 0.5, "rewards/chosen": -0.02861594967544079, "rewards/margins": -0.021743860095739365, "rewards/rejected": -0.006872090511023998, "step": 26 }, { "debug/policy_chosen_logits": -0.9092602133750916, "debug/policy_chosen_logps": -178.76705932617188, "debug/policy_rejected_logits": -0.8668628334999084, "debug/policy_rejected_logps": -176.48455810546875, "debug/reference_chosen_logps": -178.7393798828125, "debug/reference_rejected_logps": -174.53854370117188, "epoch": 0.421875, "grad_norm": 6.321703911878928, "learning_rate": 1e-06, "logits/chosen": -0.9092602133750916, "logits/rejected": -0.8668628334999084, "logps/chosen": -178.76705932617188, "logps/rejected": -176.48455810546875, "loss": 0.4628, "rewards/accuracies": 0.75, "rewards/chosen": -0.00027678441256284714, "rewards/margins": 0.01918334886431694, "rewards/rejected": -0.019460134208202362, "step": 27 }, { "debug/policy_chosen_logits": -1.1956357955932617, "debug/policy_chosen_logps": -122.40371704101562, "debug/policy_rejected_logits": -0.9366591572761536, "debug/policy_rejected_logps": -168.29766845703125, "debug/reference_chosen_logps": -127.96609497070312, "debug/reference_rejected_logps": -166.14305114746094, "epoch": 0.4375, "grad_norm": 5.594609963042074, "learning_rate": 1e-06, "logits/chosen": -1.1956357955932617, "logits/rejected": -0.9366591572761536, "logps/chosen": -122.40371704101562, "logps/rejected": -168.29766845703125, "loss": 0.4683, "rewards/accuracies": 0.625, "rewards/chosen": 0.05562380701303482, "rewards/margins": 0.07716996222734451, "rewards/rejected": -0.021546155214309692, "step": 28 }, { "debug/policy_chosen_logits": -0.8461283445358276, "debug/policy_chosen_logps": -166.58126831054688, "debug/policy_rejected_logits": -0.854517936706543, "debug/policy_rejected_logps": -160.43394470214844, "debug/reference_chosen_logps": -167.19557189941406, "debug/reference_rejected_logps": -156.71856689453125, "epoch": 0.453125, "grad_norm": 5.913333766169721, "learning_rate": 1e-06, "logits/chosen": -0.8461283445358276, "logits/rejected": -0.854517936706543, "logps/chosen": -166.58126831054688, "logps/rejected": -160.43394470214844, "loss": 0.4895, "rewards/accuracies": 0.625, "rewards/chosen": 0.006143084727227688, "rewards/margins": 0.04329695552587509, "rewards/rejected": -0.03715386986732483, "step": 29 }, { "debug/policy_chosen_logits": -0.9404685497283936, "debug/policy_chosen_logps": -169.5876922607422, "debug/policy_rejected_logits": -0.7541916966438293, "debug/policy_rejected_logps": -193.78857421875, "debug/reference_chosen_logps": -177.3969268798828, "debug/reference_rejected_logps": -190.34381103515625, "epoch": 0.46875, "grad_norm": 5.307805768669272, "learning_rate": 1e-06, "logits/chosen": -0.9404685497283936, "logits/rejected": -0.7541916966438293, "logps/chosen": -169.5876922607422, "logps/rejected": -193.78857421875, "loss": 0.4504, "rewards/accuracies": 0.875, "rewards/chosen": 0.07809236645698547, "rewards/margins": 0.11254002153873444, "rewards/rejected": -0.034447651356458664, "step": 30 }, { "debug/policy_chosen_logits": -0.9712099432945251, "debug/policy_chosen_logps": -148.6423797607422, "debug/policy_rejected_logits": -0.7749654650688171, "debug/policy_rejected_logps": -186.98977661132812, "debug/reference_chosen_logps": -149.51290893554688, "debug/reference_rejected_logps": -178.90377807617188, "epoch": 0.484375, "grad_norm": 6.072069194174168, "learning_rate": 1e-06, "logits/chosen": -0.9712099432945251, "logits/rejected": -0.7749654650688171, "logps/chosen": -148.6423797607422, "logps/rejected": -186.98977661132812, "loss": 0.4551, "rewards/accuracies": 0.875, "rewards/chosen": 0.008705183863639832, "rewards/margins": 0.08956518769264221, "rewards/rejected": -0.08086000382900238, "step": 31 }, { "debug/policy_chosen_logits": -0.7779644131660461, "debug/policy_chosen_logps": -181.0400390625, "debug/policy_rejected_logits": -0.6311701536178589, "debug/policy_rejected_logps": -197.38998413085938, "debug/reference_chosen_logps": -182.41529846191406, "debug/reference_rejected_logps": -193.42015075683594, "epoch": 0.5, "grad_norm": 5.6393792651737495, "learning_rate": 1e-06, "logits/chosen": -0.7779644131660461, "logits/rejected": -0.6311701536178589, "logps/chosen": -181.0400390625, "logps/rejected": -197.38998413085938, "loss": 0.4724, "rewards/accuracies": 0.75, "rewards/chosen": 0.013752726837992668, "rewards/margins": 0.05345122888684273, "rewards/rejected": -0.03969850391149521, "step": 32 }, { "debug/policy_chosen_logits": -0.9899529218673706, "debug/policy_chosen_logps": -140.02195739746094, "debug/policy_rejected_logits": -0.805591344833374, "debug/policy_rejected_logps": -171.04238891601562, "debug/reference_chosen_logps": -142.90939331054688, "debug/reference_rejected_logps": -164.79971313476562, "epoch": 0.515625, "grad_norm": 6.65298054287674, "learning_rate": 1e-06, "logits/chosen": -0.9899529218673706, "logits/rejected": -0.805591344833374, "logps/chosen": -140.02195739746094, "logps/rejected": -171.04238891601562, "loss": 0.4449, "rewards/accuracies": 0.625, "rewards/chosen": 0.028874464333057404, "rewards/margins": 0.09130106121301651, "rewards/rejected": -0.06242658942937851, "step": 33 }, { "debug/policy_chosen_logits": -0.8487688302993774, "debug/policy_chosen_logps": -150.60659790039062, "debug/policy_rejected_logits": -0.7607054710388184, "debug/policy_rejected_logps": -206.38253784179688, "debug/reference_chosen_logps": -152.25167846679688, "debug/reference_rejected_logps": -200.65679931640625, "epoch": 0.53125, "grad_norm": 6.500705317924212, "learning_rate": 1e-06, "logits/chosen": -0.8487688302993774, "logits/rejected": -0.7607054710388184, "logps/chosen": -150.60659790039062, "logps/rejected": -206.38253784179688, "loss": 0.4744, "rewards/accuracies": 0.5, "rewards/chosen": 0.01645086146891117, "rewards/margins": 0.07370824366807938, "rewards/rejected": -0.057257384061813354, "step": 34 }, { "debug/policy_chosen_logits": -0.7768784165382385, "debug/policy_chosen_logps": -152.84442138671875, "debug/policy_rejected_logits": -0.8615243434906006, "debug/policy_rejected_logps": -178.1951904296875, "debug/reference_chosen_logps": -156.7847900390625, "debug/reference_rejected_logps": -175.36306762695312, "epoch": 0.546875, "grad_norm": 6.2342972742178615, "learning_rate": 1e-06, "logits/chosen": -0.7768784165382385, "logits/rejected": -0.8615243434906006, "logps/chosen": -152.84442138671875, "logps/rejected": -178.1951904296875, "loss": 0.4689, "rewards/accuracies": 0.625, "rewards/chosen": 0.03940363600850105, "rewards/margins": 0.06772496551275253, "rewards/rejected": -0.02832133322954178, "step": 35 }, { "debug/policy_chosen_logits": -1.0841803550720215, "debug/policy_chosen_logps": -132.16390991210938, "debug/policy_rejected_logits": -0.8767221570014954, "debug/policy_rejected_logps": -177.02395629882812, "debug/reference_chosen_logps": -141.05548095703125, "debug/reference_rejected_logps": -180.10845947265625, "epoch": 0.5625, "grad_norm": 6.118058488612106, "learning_rate": 1e-06, "logits/chosen": -1.0841803550720215, "logits/rejected": -0.8767221570014954, "logps/chosen": -132.16390991210938, "logps/rejected": -177.02395629882812, "loss": 0.4562, "rewards/accuracies": 0.75, "rewards/chosen": 0.08891567587852478, "rewards/margins": 0.05807068198919296, "rewards/rejected": 0.030844993889331818, "step": 36 }, { "debug/policy_chosen_logits": -0.8277906179428101, "debug/policy_chosen_logps": -160.42657470703125, "debug/policy_rejected_logits": -0.9817237257957458, "debug/policy_rejected_logps": -175.60308837890625, "debug/reference_chosen_logps": -165.00286865234375, "debug/reference_rejected_logps": -175.93499755859375, "epoch": 0.578125, "grad_norm": 8.456361293746621, "learning_rate": 1e-06, "logits/chosen": -0.8277906179428101, "logits/rejected": -0.9817237257957458, "logps/chosen": -160.42657470703125, "logps/rejected": -175.60308837890625, "loss": 0.4951, "rewards/accuracies": 0.5, "rewards/chosen": 0.0457630455493927, "rewards/margins": 0.042444050312042236, "rewards/rejected": 0.003318987786769867, "step": 37 }, { "debug/policy_chosen_logits": -0.7899513244628906, "debug/policy_chosen_logps": -169.28268432617188, "debug/policy_rejected_logits": -0.8575571179389954, "debug/policy_rejected_logps": -171.0587921142578, "debug/reference_chosen_logps": -172.85174560546875, "debug/reference_rejected_logps": -171.73057556152344, "epoch": 0.59375, "grad_norm": 7.396146328689038, "learning_rate": 1e-06, "logits/chosen": -0.7899513244628906, "logits/rejected": -0.8575571179389954, "logps/chosen": -169.28268432617188, "logps/rejected": -171.0587921142578, "loss": 0.4858, "rewards/accuracies": 0.625, "rewards/chosen": 0.0356905460357666, "rewards/margins": 0.028972698375582695, "rewards/rejected": 0.006717845797538757, "step": 38 }, { "debug/policy_chosen_logits": -0.9996007680892944, "debug/policy_chosen_logps": -147.13336181640625, "debug/policy_rejected_logits": -0.7932425737380981, "debug/policy_rejected_logps": -184.53997802734375, "debug/reference_chosen_logps": -149.77064514160156, "debug/reference_rejected_logps": -183.35354614257812, "epoch": 0.609375, "grad_norm": 5.981284123668312, "learning_rate": 1e-06, "logits/chosen": -0.9996007680892944, "logits/rejected": -0.7932425737380981, "logps/chosen": -147.13336181640625, "logps/rejected": -184.53997802734375, "loss": 0.4553, "rewards/accuracies": 0.75, "rewards/chosen": 0.02637273073196411, "rewards/margins": 0.03823715075850487, "rewards/rejected": -0.011864423751831055, "step": 39 }, { "debug/policy_chosen_logits": -0.7280028462409973, "debug/policy_chosen_logps": -176.14102172851562, "debug/policy_rejected_logits": -0.6034502387046814, "debug/policy_rejected_logps": -213.32142639160156, "debug/reference_chosen_logps": -169.86448669433594, "debug/reference_rejected_logps": -208.47547912597656, "epoch": 0.625, "grad_norm": 8.108956880586513, "learning_rate": 1e-06, "logits/chosen": -0.7280028462409973, "logits/rejected": -0.6034502387046814, "logps/chosen": -176.14102172851562, "logps/rejected": -213.32142639160156, "loss": 0.4908, "rewards/accuracies": 0.5, "rewards/chosen": -0.06276529282331467, "rewards/margins": -0.014305687509477139, "rewards/rejected": -0.048459604382514954, "step": 40 }, { "debug/policy_chosen_logits": -1.0434032678604126, "debug/policy_chosen_logps": -155.97015380859375, "debug/policy_rejected_logits": -0.8284803032875061, "debug/policy_rejected_logps": -191.99710083007812, "debug/reference_chosen_logps": -157.0880584716797, "debug/reference_rejected_logps": -182.62484741210938, "epoch": 0.640625, "grad_norm": 7.874523792109847, "learning_rate": 1e-06, "logits/chosen": -1.0434032678604126, "logits/rejected": -0.8284803032875061, "logps/chosen": -155.97015380859375, "logps/rejected": -191.99710083007812, "loss": 0.4365, "rewards/accuracies": 0.75, "rewards/chosen": 0.011179141700267792, "rewards/margins": 0.10490170121192932, "rewards/rejected": -0.09372256696224213, "step": 41 }, { "debug/policy_chosen_logits": -0.588237464427948, "debug/policy_chosen_logps": -166.4864959716797, "debug/policy_rejected_logits": -0.7331523299217224, "debug/policy_rejected_logps": -181.28952026367188, "debug/reference_chosen_logps": -166.72225952148438, "debug/reference_rejected_logps": -174.8398895263672, "epoch": 0.65625, "grad_norm": 6.8274046949424045, "learning_rate": 1e-06, "logits/chosen": -0.588237464427948, "logits/rejected": -0.7331523299217224, "logps/chosen": -166.4864959716797, "logps/rejected": -181.28952026367188, "loss": 0.4523, "rewards/accuracies": 0.625, "rewards/chosen": 0.00235767406411469, "rewards/margins": 0.06685390323400497, "rewards/rejected": -0.0644962340593338, "step": 42 }, { "debug/policy_chosen_logits": -0.8734938502311707, "debug/policy_chosen_logps": -129.81631469726562, "debug/policy_rejected_logits": -0.879021942615509, "debug/policy_rejected_logps": -151.5695037841797, "debug/reference_chosen_logps": -132.09263610839844, "debug/reference_rejected_logps": -146.77114868164062, "epoch": 0.671875, "grad_norm": 6.18097446051221, "learning_rate": 1e-06, "logits/chosen": -0.8734938502311707, "logits/rejected": -0.879021942615509, "logps/chosen": -129.81631469726562, "logps/rejected": -151.5695037841797, "loss": 0.4407, "rewards/accuracies": 0.625, "rewards/chosen": 0.022763298824429512, "rewards/margins": 0.07074688374996185, "rewards/rejected": -0.04798358678817749, "step": 43 }, { "debug/policy_chosen_logits": -0.90626060962677, "debug/policy_chosen_logps": -139.66921997070312, "debug/policy_rejected_logits": -0.8777171969413757, "debug/policy_rejected_logps": -187.69277954101562, "debug/reference_chosen_logps": -145.80397033691406, "debug/reference_rejected_logps": -181.13963317871094, "epoch": 0.6875, "grad_norm": 7.207738479298261, "learning_rate": 1e-06, "logits/chosen": -0.90626060962677, "logits/rejected": -0.8777171969413757, "logps/chosen": -139.66921997070312, "logps/rejected": -187.69277954101562, "loss": 0.4527, "rewards/accuracies": 0.875, "rewards/chosen": 0.06134761869907379, "rewards/margins": 0.1268792748451233, "rewards/rejected": -0.0655316635966301, "step": 44 }, { "debug/policy_chosen_logits": -0.7800077199935913, "debug/policy_chosen_logps": -153.02247619628906, "debug/policy_rejected_logits": -0.8804305791854858, "debug/policy_rejected_logps": -175.3987274169922, "debug/reference_chosen_logps": -160.71377563476562, "debug/reference_rejected_logps": -164.52713012695312, "epoch": 0.703125, "grad_norm": 6.76588354529725, "learning_rate": 1e-06, "logits/chosen": -0.7800077199935913, "logits/rejected": -0.8804305791854858, "logps/chosen": -153.02247619628906, "logps/rejected": -175.3987274169922, "loss": 0.4337, "rewards/accuracies": 0.75, "rewards/chosen": 0.07691291719675064, "rewards/margins": 0.18562886118888855, "rewards/rejected": -0.1087159514427185, "step": 45 }, { "debug/policy_chosen_logits": -0.6891363859176636, "debug/policy_chosen_logps": -174.951416015625, "debug/policy_rejected_logits": -0.7365065217018127, "debug/policy_rejected_logps": -210.58712768554688, "debug/reference_chosen_logps": -179.22348022460938, "debug/reference_rejected_logps": -203.9578399658203, "epoch": 0.71875, "grad_norm": 6.33536574920728, "learning_rate": 1e-06, "logits/chosen": -0.6891363859176636, "logits/rejected": -0.7365065217018127, "logps/chosen": -174.951416015625, "logps/rejected": -210.58712768554688, "loss": 0.4355, "rewards/accuracies": 0.625, "rewards/chosen": 0.04272085428237915, "rewards/margins": 0.109013631939888, "rewards/rejected": -0.06629277765750885, "step": 46 }, { "debug/policy_chosen_logits": -1.0616976022720337, "debug/policy_chosen_logps": -140.75953674316406, "debug/policy_rejected_logits": -0.8193640112876892, "debug/policy_rejected_logps": -166.1212158203125, "debug/reference_chosen_logps": -145.49392700195312, "debug/reference_rejected_logps": -169.64987182617188, "epoch": 0.734375, "grad_norm": 7.072020108503254, "learning_rate": 1e-06, "logits/chosen": -1.0616976022720337, "logits/rejected": -0.8193640112876892, "logps/chosen": -140.75953674316406, "logps/rejected": -166.1212158203125, "loss": 0.4804, "rewards/accuracies": 0.5, "rewards/chosen": 0.04734383523464203, "rewards/margins": 0.012057198211550713, "rewards/rejected": 0.03528663516044617, "step": 47 }, { "debug/policy_chosen_logits": -0.8684768080711365, "debug/policy_chosen_logps": -168.0657958984375, "debug/policy_rejected_logits": -0.9659979939460754, "debug/policy_rejected_logps": -164.24932861328125, "debug/reference_chosen_logps": -175.15855407714844, "debug/reference_rejected_logps": -156.83241271972656, "epoch": 0.75, "grad_norm": 6.146768918478943, "learning_rate": 1e-06, "logits/chosen": -0.8684768080711365, "logits/rejected": -0.9659979939460754, "logps/chosen": -168.0657958984375, "logps/rejected": -164.24932861328125, "loss": 0.4465, "rewards/accuracies": 0.625, "rewards/chosen": 0.0709274560213089, "rewards/margins": 0.14509651064872742, "rewards/rejected": -0.07416905462741852, "step": 48 }, { "debug/policy_chosen_logits": -0.7563624382019043, "debug/policy_chosen_logps": -178.76950073242188, "debug/policy_rejected_logits": -0.639800488948822, "debug/policy_rejected_logps": -168.65484619140625, "debug/reference_chosen_logps": -178.57791137695312, "debug/reference_rejected_logps": -171.87460327148438, "epoch": 0.765625, "grad_norm": 5.930093840031159, "learning_rate": 1e-06, "logits/chosen": -0.7563624382019043, "logits/rejected": -0.639800488948822, "logps/chosen": -178.76950073242188, "logps/rejected": -168.65484619140625, "loss": 0.4697, "rewards/accuracies": 0.375, "rewards/chosen": -0.001915864646434784, "rewards/margins": -0.034113530069589615, "rewards/rejected": 0.03219766542315483, "step": 49 }, { "debug/policy_chosen_logits": -0.7761915922164917, "debug/policy_chosen_logps": -167.15562438964844, "debug/policy_rejected_logits": -0.8122952580451965, "debug/policy_rejected_logps": -167.90469360351562, "debug/reference_chosen_logps": -169.98074340820312, "debug/reference_rejected_logps": -171.24453735351562, "epoch": 0.78125, "grad_norm": 6.402012147266644, "learning_rate": 1e-06, "logits/chosen": -0.7761915922164917, "logits/rejected": -0.8122952580451965, "logps/chosen": -167.15562438964844, "logps/rejected": -167.90469360351562, "loss": 0.4469, "rewards/accuracies": 0.375, "rewards/chosen": 0.028251150622963905, "rewards/margins": -0.0051473043859004974, "rewards/rejected": 0.03339845687150955, "step": 50 }, { "debug/policy_chosen_logits": -1.0047523975372314, "debug/policy_chosen_logps": -161.81784057617188, "debug/policy_rejected_logits": -0.6831015944480896, "debug/policy_rejected_logps": -189.47030639648438, "debug/reference_chosen_logps": -159.06884765625, "debug/reference_rejected_logps": -185.1800537109375, "epoch": 0.796875, "grad_norm": 6.034796225510894, "learning_rate": 1e-06, "logits/chosen": -1.0047523975372314, "logits/rejected": -0.6831015944480896, "logps/chosen": -161.81784057617188, "logps/rejected": -189.47030639648438, "loss": 0.45, "rewards/accuracies": 0.5, "rewards/chosen": -0.027489986270666122, "rewards/margins": 0.015412424691021442, "rewards/rejected": -0.04290241375565529, "step": 51 }, { "debug/policy_chosen_logits": -0.9873104691505432, "debug/policy_chosen_logps": -147.3292236328125, "debug/policy_rejected_logits": -0.9074857234954834, "debug/policy_rejected_logps": -160.4976806640625, "debug/reference_chosen_logps": -150.1821746826172, "debug/reference_rejected_logps": -164.38450622558594, "epoch": 0.8125, "grad_norm": 5.995554347552576, "learning_rate": 1e-06, "logits/chosen": -0.9873104691505432, "logits/rejected": -0.9074857234954834, "logps/chosen": -147.3292236328125, "logps/rejected": -160.4976806640625, "loss": 0.4478, "rewards/accuracies": 0.375, "rewards/chosen": 0.028529377654194832, "rewards/margins": -0.010338811203837395, "rewards/rejected": 0.03886818885803223, "step": 52 }, { "debug/policy_chosen_logits": -0.8061836957931519, "debug/policy_chosen_logps": -154.01083374023438, "debug/policy_rejected_logits": -0.9387882947921753, "debug/policy_rejected_logps": -162.59048461914062, "debug/reference_chosen_logps": -165.92892456054688, "debug/reference_rejected_logps": -159.9312744140625, "epoch": 0.828125, "grad_norm": 6.573209394571419, "learning_rate": 1e-06, "logits/chosen": -0.8061836957931519, "logits/rejected": -0.9387882947921753, "logps/chosen": -154.01083374023438, "logps/rejected": -162.59048461914062, "loss": 0.4394, "rewards/accuracies": 0.875, "rewards/chosen": 0.11918096244335175, "rewards/margins": 0.14577314257621765, "rewards/rejected": -0.02659217081964016, "step": 53 }, { "debug/policy_chosen_logits": -0.8860620260238647, "debug/policy_chosen_logps": -171.57965087890625, "debug/policy_rejected_logits": -0.8173072934150696, "debug/policy_rejected_logps": -172.11398315429688, "debug/reference_chosen_logps": -175.11090087890625, "debug/reference_rejected_logps": -177.931396484375, "epoch": 0.84375, "grad_norm": 6.3343088344275875, "learning_rate": 1e-06, "logits/chosen": -0.8860620260238647, "logits/rejected": -0.8173072934150696, "logps/chosen": -171.57965087890625, "logps/rejected": -172.11398315429688, "loss": 0.4577, "rewards/accuracies": 0.75, "rewards/chosen": 0.03531248867511749, "rewards/margins": -0.02286178432404995, "rewards/rejected": 0.05817427486181259, "step": 54 }, { "debug/policy_chosen_logits": -0.9360355734825134, "debug/policy_chosen_logps": -159.60911560058594, "debug/policy_rejected_logits": -0.8322389125823975, "debug/policy_rejected_logps": -184.9361572265625, "debug/reference_chosen_logps": -161.784912109375, "debug/reference_rejected_logps": -181.3787841796875, "epoch": 0.859375, "grad_norm": 6.940576793005815, "learning_rate": 1e-06, "logits/chosen": -0.9360355734825134, "logits/rejected": -0.8322389125823975, "logps/chosen": -159.60911560058594, "logps/rejected": -184.9361572265625, "loss": 0.4589, "rewards/accuracies": 0.375, "rewards/chosen": 0.02175775356590748, "rewards/margins": 0.05733121931552887, "rewards/rejected": -0.03557346388697624, "step": 55 }, { "debug/policy_chosen_logits": -0.7893481850624084, "debug/policy_chosen_logps": -171.56735229492188, "debug/policy_rejected_logits": -0.7332755327224731, "debug/policy_rejected_logps": -196.43365478515625, "debug/reference_chosen_logps": -171.05889892578125, "debug/reference_rejected_logps": -182.0035400390625, "epoch": 0.875, "grad_norm": 6.971775270738982, "learning_rate": 1e-06, "logits/chosen": -0.7893481850624084, "logits/rejected": -0.7332755327224731, "logps/chosen": -171.56735229492188, "logps/rejected": -196.43365478515625, "loss": 0.4405, "rewards/accuracies": 0.875, "rewards/chosen": -0.00508442847058177, "rewards/margins": 0.13921663165092468, "rewards/rejected": -0.14430105686187744, "step": 56 }, { "debug/policy_chosen_logits": -1.2259751558303833, "debug/policy_chosen_logps": -143.96656799316406, "debug/policy_rejected_logits": -0.825638473033905, "debug/policy_rejected_logps": -187.6162567138672, "debug/reference_chosen_logps": -142.93975830078125, "debug/reference_rejected_logps": -183.60317993164062, "epoch": 0.890625, "grad_norm": 7.861463588461739, "learning_rate": 1e-06, "logits/chosen": -1.2259751558303833, "logits/rejected": -0.825638473033905, "logps/chosen": -143.96656799316406, "logps/rejected": -187.6162567138672, "loss": 0.4669, "rewards/accuracies": 0.625, "rewards/chosen": -0.010268086567521095, "rewards/margins": 0.029862696304917336, "rewards/rejected": -0.04013078659772873, "step": 57 }, { "debug/policy_chosen_logits": -0.6283476948738098, "debug/policy_chosen_logps": -179.42198181152344, "debug/policy_rejected_logits": -0.7192294001579285, "debug/policy_rejected_logps": -160.84140014648438, "debug/reference_chosen_logps": -179.8939666748047, "debug/reference_rejected_logps": -157.6416778564453, "epoch": 0.90625, "grad_norm": 5.979721246123928, "learning_rate": 1e-06, "logits/chosen": -0.6283476948738098, "logits/rejected": -0.7192294001579285, "logps/chosen": -179.42198181152344, "logps/rejected": -160.84140014648438, "loss": 0.4573, "rewards/accuracies": 0.5, "rewards/chosen": 0.004719886463135481, "rewards/margins": 0.03671710193157196, "rewards/rejected": -0.03199721872806549, "step": 58 }, { "debug/policy_chosen_logits": -1.0611419677734375, "debug/policy_chosen_logps": -162.58331298828125, "debug/policy_rejected_logits": -0.9486455917358398, "debug/policy_rejected_logps": -164.91607666015625, "debug/reference_chosen_logps": -168.58851623535156, "debug/reference_rejected_logps": -160.13070678710938, "epoch": 0.921875, "grad_norm": 6.255365355958018, "learning_rate": 1e-06, "logits/chosen": -1.0611419677734375, "logits/rejected": -0.9486455917358398, "logps/chosen": -162.58331298828125, "logps/rejected": -164.91607666015625, "loss": 0.4572, "rewards/accuracies": 0.75, "rewards/chosen": 0.06005190685391426, "rewards/margins": 0.10790553689002991, "rewards/rejected": -0.04785362631082535, "step": 59 }, { "debug/policy_chosen_logits": -0.8236188292503357, "debug/policy_chosen_logps": -162.90415954589844, "debug/policy_rejected_logits": -0.8510185480117798, "debug/policy_rejected_logps": -187.468994140625, "debug/reference_chosen_logps": -171.4542236328125, "debug/reference_rejected_logps": -171.10787963867188, "epoch": 0.9375, "grad_norm": 7.036142220314558, "learning_rate": 1e-06, "logits/chosen": -0.8236188292503357, "logits/rejected": -0.8510185480117798, "logps/chosen": -162.90415954589844, "logps/rejected": -187.468994140625, "loss": 0.4524, "rewards/accuracies": 0.75, "rewards/chosen": 0.08550070226192474, "rewards/margins": 0.24911174178123474, "rewards/rejected": -0.1636110544204712, "step": 60 }, { "debug/policy_chosen_logits": -0.9428707361221313, "debug/policy_chosen_logps": -148.73089599609375, "debug/policy_rejected_logits": -0.8415032029151917, "debug/policy_rejected_logps": -179.90115356445312, "debug/reference_chosen_logps": -155.75228881835938, "debug/reference_rejected_logps": -173.25796508789062, "epoch": 0.953125, "grad_norm": 7.382144233072753, "learning_rate": 1e-06, "logits/chosen": -0.9428707361221313, "logits/rejected": -0.8415032029151917, "logps/chosen": -148.73089599609375, "logps/rejected": -179.90115356445312, "loss": 0.4308, "rewards/accuracies": 0.75, "rewards/chosen": 0.07021382451057434, "rewards/margins": 0.1366458535194397, "rewards/rejected": -0.06643202900886536, "step": 61 }, { "debug/policy_chosen_logits": -0.7142590880393982, "debug/policy_chosen_logps": -183.39328002929688, "debug/policy_rejected_logits": -0.8318662643432617, "debug/policy_rejected_logps": -185.28347778320312, "debug/reference_chosen_logps": -185.95765686035156, "debug/reference_rejected_logps": -188.14785766601562, "epoch": 0.96875, "grad_norm": 7.772884707461087, "learning_rate": 1e-06, "logits/chosen": -0.7142590880393982, "logits/rejected": -0.8318662643432617, "logps/chosen": -183.39328002929688, "logps/rejected": -185.28347778320312, "loss": 0.4731, "rewards/accuracies": 0.5, "rewards/chosen": 0.025643818080425262, "rewards/margins": -0.0030001532286405563, "rewards/rejected": 0.02864396944642067, "step": 62 }, { "debug/policy_chosen_logits": -0.8611500859260559, "debug/policy_chosen_logps": -148.0809326171875, "debug/policy_rejected_logits": -0.9426791667938232, "debug/policy_rejected_logps": -159.23069763183594, "debug/reference_chosen_logps": -153.9005126953125, "debug/reference_rejected_logps": -147.47735595703125, "epoch": 0.984375, "grad_norm": 6.5837351291708375, "learning_rate": 1e-06, "logits/chosen": -0.8611500859260559, "logits/rejected": -0.9426791667938232, "logps/chosen": -148.0809326171875, "logps/rejected": -159.23069763183594, "loss": 0.4355, "rewards/accuracies": 1.0, "rewards/chosen": 0.058195747435092926, "rewards/margins": 0.1757291853427887, "rewards/rejected": -0.11753343790769577, "step": 63 }, { "debug/policy_chosen_logits": -0.8686034083366394, "debug/policy_chosen_logps": -163.016357421875, "debug/policy_rejected_logits": -0.8933126926422119, "debug/policy_rejected_logps": -178.46234130859375, "debug/reference_chosen_logps": -166.87405395507812, "debug/reference_rejected_logps": -176.6748046875, "epoch": 1.0, "grad_norm": 8.108425274874563, "learning_rate": 1e-06, "logits/chosen": -0.8686034083366394, "logits/rejected": -0.8933126926422119, "logps/chosen": -163.016357421875, "logps/rejected": -178.46234130859375, "loss": 0.4087, "rewards/accuracies": 0.5, "rewards/chosen": 0.03857698291540146, "rewards/margins": 0.05645231530070305, "rewards/rejected": -0.017875326797366142, "step": 64 }, { "epoch": 1.0, "step": 64, "total_flos": 0.0, "train_loss": 0.46988651156425476, "train_runtime": 197.5097, "train_samples_per_second": 20.627, "train_steps_per_second": 0.324 } ], "logging_steps": 1, "max_steps": 64, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }