{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 43, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -1.4800337553024292, "debug/policy_chosen_logps": -227.59622192382812, "debug/policy_rejected_logits": -1.3822641372680664, "debug/policy_rejected_logps": -260.10986328125, "debug/reference_chosen_logps": -227.59622192382812, "debug/reference_rejected_logps": -260.10986328125, "epoch": 0.023255813953488372, "grad_norm": 25.286757766739537, "learning_rate": 1e-06, "logits/chosen": -1.4800337553024292, "logits/rejected": -1.3822641372680664, "logps/chosen": -227.59622192382812, "logps/rejected": -260.10986328125, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -1.4440364837646484, "debug/policy_chosen_logps": -259.37353515625, "debug/policy_rejected_logits": -1.4269922971725464, "debug/policy_rejected_logps": -346.24896240234375, "debug/reference_chosen_logps": -260.3768005371094, "debug/reference_rejected_logps": -346.227783203125, "epoch": 0.046511627906976744, "grad_norm": 31.24773632261976, "learning_rate": 1e-06, "logits/chosen": -1.4440364837646484, "logits/rejected": -1.4269922971725464, "logps/chosen": -259.37353515625, "logps/rejected": -346.24896240234375, "loss": 0.4934, "rewards/accuracies": 0.625, "rewards/chosen": 0.010032729245722294, "rewards/margins": 0.010244522243738174, "rewards/rejected": -0.0002117917174473405, "step": 2 }, { "debug/policy_chosen_logits": -1.6697512865066528, "debug/policy_chosen_logps": -243.97499084472656, "debug/policy_rejected_logits": -1.595983624458313, "debug/policy_rejected_logps": -305.7943420410156, "debug/reference_chosen_logps": -246.6484375, "debug/reference_rejected_logps": -309.16387939453125, "epoch": 0.06976744186046512, "grad_norm": 21.531609545676446, "learning_rate": 1e-06, "logits/chosen": -1.6697512865066528, "logits/rejected": -1.595983624458313, "logps/chosen": -243.97499084472656, "logps/rejected": -305.7943420410156, "loss": 0.4894, "rewards/accuracies": 0.5, "rewards/chosen": 0.026734504848718643, "rewards/margins": -0.006960991304367781, "rewards/rejected": 0.03369549661874771, "step": 3 }, { "debug/policy_chosen_logits": -1.5738418102264404, "debug/policy_chosen_logps": -220.88735961914062, "debug/policy_rejected_logits": -1.460009217262268, "debug/policy_rejected_logps": -338.9933776855469, "debug/reference_chosen_logps": -227.4576416015625, "debug/reference_rejected_logps": -344.28399658203125, "epoch": 0.09302325581395349, "grad_norm": 44.438009389978966, "learning_rate": 1e-06, "logits/chosen": -1.5738418102264404, "logits/rejected": -1.460009217262268, "logps/chosen": -220.88735961914062, "logps/rejected": -338.9933776855469, "loss": 0.5239, "rewards/accuracies": 0.375, "rewards/chosen": 0.06570270657539368, "rewards/margins": 0.012796590104699135, "rewards/rejected": 0.052906110882759094, "step": 4 }, { "debug/policy_chosen_logits": -1.5304296016693115, "debug/policy_chosen_logps": -243.91213989257812, "debug/policy_rejected_logits": -1.435927152633667, "debug/policy_rejected_logps": -243.02850341796875, "debug/reference_chosen_logps": -247.21408081054688, "debug/reference_rejected_logps": -247.84434509277344, "epoch": 0.11627906976744186, "grad_norm": 28.93471569409853, "learning_rate": 1e-06, "logits/chosen": -1.5304296016693115, "logits/rejected": -1.435927152633667, "logps/chosen": -243.91213989257812, "logps/rejected": -243.02850341796875, "loss": 0.507, "rewards/accuracies": 0.75, "rewards/chosen": 0.03301956132054329, "rewards/margins": -0.015139006078243256, "rewards/rejected": 0.04815857112407684, "step": 5 }, { "debug/policy_chosen_logits": -1.485123872756958, "debug/policy_chosen_logps": -229.44435119628906, "debug/policy_rejected_logits": -1.376402735710144, "debug/policy_rejected_logps": -262.6148376464844, "debug/reference_chosen_logps": -231.71469116210938, "debug/reference_rejected_logps": -260.9300842285156, "epoch": 0.13953488372093023, "grad_norm": 12.44043620690519, "learning_rate": 1e-06, "logits/chosen": -1.485123872756958, "logits/rejected": -1.376402735710144, "logps/chosen": -229.44435119628906, "logps/rejected": -262.6148376464844, "loss": 0.4907, "rewards/accuracies": 0.625, "rewards/chosen": 0.022703303024172783, "rewards/margins": 0.03955078125, "rewards/rejected": -0.016847476363182068, "step": 6 }, { "debug/policy_chosen_logits": -1.489279866218567, "debug/policy_chosen_logps": -197.94729614257812, "debug/policy_rejected_logits": -1.3406989574432373, "debug/policy_rejected_logps": -268.1605224609375, "debug/reference_chosen_logps": -196.8703155517578, "debug/reference_rejected_logps": -269.4376525878906, "epoch": 0.16279069767441862, "grad_norm": 10.958280340397616, "learning_rate": 1e-06, "logits/chosen": -1.489279866218567, "logits/rejected": -1.3406989574432373, "logps/chosen": -197.94729614257812, "logps/rejected": -268.1605224609375, "loss": 0.487, "rewards/accuracies": 0.375, "rewards/chosen": -0.010769689455628395, "rewards/margins": -0.02354125864803791, "rewards/rejected": 0.012771566398441792, "step": 7 }, { "debug/policy_chosen_logits": -1.3707197904586792, "debug/policy_chosen_logps": -237.71815490722656, "debug/policy_rejected_logits": -1.330206036567688, "debug/policy_rejected_logps": -239.78775024414062, "debug/reference_chosen_logps": -239.93931579589844, "debug/reference_rejected_logps": -240.64669799804688, "epoch": 0.18604651162790697, "grad_norm": 19.632633175850014, "learning_rate": 1e-06, "logits/chosen": -1.3707197904586792, "logits/rejected": -1.330206036567688, "logps/chosen": -237.71815490722656, "logps/rejected": -239.78775024414062, "loss": 0.4876, "rewards/accuracies": 0.5, "rewards/chosen": 0.022211609408259392, "rewards/margins": 0.013622130267322063, "rewards/rejected": 0.008589478209614754, "step": 8 }, { "debug/policy_chosen_logits": -1.7030099630355835, "debug/policy_chosen_logps": -199.52928161621094, "debug/policy_rejected_logits": -1.4712837934494019, "debug/policy_rejected_logps": -255.04507446289062, "debug/reference_chosen_logps": -195.96688842773438, "debug/reference_rejected_logps": -253.04783630371094, "epoch": 0.20930232558139536, "grad_norm": 15.51886442055276, "learning_rate": 1e-06, "logits/chosen": -1.7030099630355835, "logits/rejected": -1.4712837934494019, "logps/chosen": -199.52928161621094, "logps/rejected": -255.04507446289062, "loss": 0.5014, "rewards/accuracies": 0.375, "rewards/chosen": -0.03562391176819801, "rewards/margins": -0.015651512891054153, "rewards/rejected": -0.01997240073978901, "step": 9 }, { "debug/policy_chosen_logits": -1.520648717880249, "debug/policy_chosen_logps": -286.5086669921875, "debug/policy_rejected_logits": -1.5609104633331299, "debug/policy_rejected_logps": -271.101318359375, "debug/reference_chosen_logps": -280.365478515625, "debug/reference_rejected_logps": -266.813720703125, "epoch": 0.23255813953488372, "grad_norm": 13.373127593427753, "learning_rate": 1e-06, "logits/chosen": -1.520648717880249, "logits/rejected": -1.5609104633331299, "logps/chosen": -286.5086669921875, "logps/rejected": -271.101318359375, "loss": 0.4933, "rewards/accuracies": 0.25, "rewards/chosen": -0.06143181025981903, "rewards/margins": -0.018555622547864914, "rewards/rejected": -0.04287618398666382, "step": 10 }, { "debug/policy_chosen_logits": -1.617360234260559, "debug/policy_chosen_logps": -244.319580078125, "debug/policy_rejected_logits": -1.5032525062561035, "debug/policy_rejected_logps": -301.5211486816406, "debug/reference_chosen_logps": -238.87344360351562, "debug/reference_rejected_logps": -295.4085693359375, "epoch": 0.2558139534883721, "grad_norm": 12.552727674179597, "learning_rate": 1e-06, "logits/chosen": -1.617360234260559, "logits/rejected": -1.5032525062561035, "logps/chosen": -244.319580078125, "logps/rejected": -301.5211486816406, "loss": 0.5067, "rewards/accuracies": 0.625, "rewards/chosen": -0.054461196064949036, "rewards/margins": 0.00666469382122159, "rewards/rejected": -0.06112588942050934, "step": 11 }, { "debug/policy_chosen_logits": -1.4798977375030518, "debug/policy_chosen_logps": -253.769775390625, "debug/policy_rejected_logits": -1.4382582902908325, "debug/policy_rejected_logps": -334.34503173828125, "debug/reference_chosen_logps": -247.32626342773438, "debug/reference_rejected_logps": -325.88397216796875, "epoch": 0.27906976744186046, "grad_norm": 30.353653361523758, "learning_rate": 1e-06, "logits/chosen": -1.4798977375030518, "logits/rejected": -1.4382582902908325, "logps/chosen": -253.769775390625, "logps/rejected": -334.34503173828125, "loss": 0.501, "rewards/accuracies": 0.625, "rewards/chosen": -0.064435213804245, "rewards/margins": 0.02017536386847496, "rewards/rejected": -0.08461058139801025, "step": 12 }, { "debug/policy_chosen_logits": -1.626142144203186, "debug/policy_chosen_logps": -242.15365600585938, "debug/policy_rejected_logits": -1.5628588199615479, "debug/policy_rejected_logps": -276.919189453125, "debug/reference_chosen_logps": -234.9322967529297, "debug/reference_rejected_logps": -271.10296630859375, "epoch": 0.3023255813953488, "grad_norm": 29.145511000911654, "learning_rate": 1e-06, "logits/chosen": -1.626142144203186, "logits/rejected": -1.5628588199615479, "logps/chosen": -242.15365600585938, "logps/rejected": -276.919189453125, "loss": 0.4977, "rewards/accuracies": 0.5, "rewards/chosen": -0.07221347838640213, "rewards/margins": -0.014051439240574837, "rewards/rejected": -0.058162033557891846, "step": 13 }, { "debug/policy_chosen_logits": -1.494837999343872, "debug/policy_chosen_logps": -253.470947265625, "debug/policy_rejected_logits": -1.3477786779403687, "debug/policy_rejected_logps": -385.6488037109375, "debug/reference_chosen_logps": -248.0553741455078, "debug/reference_rejected_logps": -376.4827575683594, "epoch": 0.32558139534883723, "grad_norm": 20.5442036061972, "learning_rate": 1e-06, "logits/chosen": -1.494837999343872, "logits/rejected": -1.3477786779403687, "logps/chosen": -253.470947265625, "logps/rejected": -385.6488037109375, "loss": 0.4858, "rewards/accuracies": 0.875, "rewards/chosen": -0.05415596067905426, "rewards/margins": 0.03750423341989517, "rewards/rejected": -0.09166019409894943, "step": 14 }, { "debug/policy_chosen_logits": -1.4637919664382935, "debug/policy_chosen_logps": -228.59967041015625, "debug/policy_rejected_logits": -1.450419306755066, "debug/policy_rejected_logps": -283.78955078125, "debug/reference_chosen_logps": -222.84097290039062, "debug/reference_rejected_logps": -279.7840576171875, "epoch": 0.3488372093023256, "grad_norm": 16.220449203622703, "learning_rate": 1e-06, "logits/chosen": -1.4637919664382935, "logits/rejected": -1.450419306755066, "logps/chosen": -228.59967041015625, "logps/rejected": -283.78955078125, "loss": 0.4913, "rewards/accuracies": 0.5, "rewards/chosen": -0.05758703127503395, "rewards/margins": -0.017532480880618095, "rewards/rejected": -0.040054552257061005, "step": 15 }, { "debug/policy_chosen_logits": -1.490121841430664, "debug/policy_chosen_logps": -261.367431640625, "debug/policy_rejected_logits": -1.4834879636764526, "debug/policy_rejected_logps": -324.0771484375, "debug/reference_chosen_logps": -262.689697265625, "debug/reference_rejected_logps": -320.5054931640625, "epoch": 0.37209302325581395, "grad_norm": 12.5026857634808, "learning_rate": 1e-06, "logits/chosen": -1.490121841430664, "logits/rejected": -1.4834879636764526, "logps/chosen": -261.367431640625, "logps/rejected": -324.0771484375, "loss": 0.4941, "rewards/accuracies": 0.5, "rewards/chosen": 0.013222455978393555, "rewards/margins": 0.04893936961889267, "rewards/rejected": -0.035716913640499115, "step": 16 }, { "debug/policy_chosen_logits": -1.4635764360427856, "debug/policy_chosen_logps": -253.47198486328125, "debug/policy_rejected_logits": -1.4481921195983887, "debug/policy_rejected_logps": -267.24908447265625, "debug/reference_chosen_logps": -253.2239532470703, "debug/reference_rejected_logps": -265.45428466796875, "epoch": 0.3953488372093023, "grad_norm": 14.984254627556092, "learning_rate": 1e-06, "logits/chosen": -1.4635764360427856, "logits/rejected": -1.4481921195983887, "logps/chosen": -253.47198486328125, "logps/rejected": -267.24908447265625, "loss": 0.4716, "rewards/accuracies": 0.5, "rewards/chosen": -0.00248044915497303, "rewards/margins": 0.015467623248696327, "rewards/rejected": -0.017948072403669357, "step": 17 }, { "debug/policy_chosen_logits": -1.5003846883773804, "debug/policy_chosen_logps": -239.0650634765625, "debug/policy_rejected_logits": -1.4701699018478394, "debug/policy_rejected_logps": -291.6846618652344, "debug/reference_chosen_logps": -242.17843627929688, "debug/reference_rejected_logps": -290.8171691894531, "epoch": 0.4186046511627907, "grad_norm": 62.745975345402044, "learning_rate": 1e-06, "logits/chosen": -1.5003846883773804, "logits/rejected": -1.4701699018478394, "logps/chosen": -239.0650634765625, "logps/rejected": -291.6846618652344, "loss": 0.513, "rewards/accuracies": 0.75, "rewards/chosen": 0.03113384172320366, "rewards/margins": 0.039808571338653564, "rewards/rejected": -0.008674736134707928, "step": 18 }, { "debug/policy_chosen_logits": -1.5634993314743042, "debug/policy_chosen_logps": -282.7076721191406, "debug/policy_rejected_logits": -1.4637484550476074, "debug/policy_rejected_logps": -369.665771484375, "debug/reference_chosen_logps": -285.98809814453125, "debug/reference_rejected_logps": -365.2072448730469, "epoch": 0.4418604651162791, "grad_norm": 67.5680733312671, "learning_rate": 1e-06, "logits/chosen": -1.5634993314743042, "logits/rejected": -1.4637484550476074, "logps/chosen": -282.7076721191406, "logps/rejected": -369.665771484375, "loss": 0.5182, "rewards/accuracies": 0.75, "rewards/chosen": 0.032804299145936966, "rewards/margins": 0.07738937437534332, "rewards/rejected": -0.04458507522940636, "step": 19 }, { "debug/policy_chosen_logits": -1.5499062538146973, "debug/policy_chosen_logps": -243.81826782226562, "debug/policy_rejected_logits": -1.428619146347046, "debug/policy_rejected_logps": -278.8910217285156, "debug/reference_chosen_logps": -241.1705780029297, "debug/reference_rejected_logps": -282.3166809082031, "epoch": 0.46511627906976744, "grad_norm": 33.30284560459796, "learning_rate": 1e-06, "logits/chosen": -1.5499062538146973, "logits/rejected": -1.428619146347046, "logps/chosen": -243.81826782226562, "logps/rejected": -278.8910217285156, "loss": 0.5014, "rewards/accuracies": 0.25, "rewards/chosen": -0.026476802304387093, "rewards/margins": -0.06073341518640518, "rewards/rejected": 0.03425660729408264, "step": 20 }, { "debug/policy_chosen_logits": -1.5484369993209839, "debug/policy_chosen_logps": -236.13153076171875, "debug/policy_rejected_logits": -1.4573348760604858, "debug/policy_rejected_logps": -318.0836181640625, "debug/reference_chosen_logps": -238.38150024414062, "debug/reference_rejected_logps": -311.675537109375, "epoch": 0.4883720930232558, "grad_norm": 26.028206528344583, "learning_rate": 1e-06, "logits/chosen": -1.5484369993209839, "logits/rejected": -1.4573348760604858, "logps/chosen": -236.13153076171875, "logps/rejected": -318.0836181640625, "loss": 0.4652, "rewards/accuracies": 1.0, "rewards/chosen": 0.022499503567814827, "rewards/margins": 0.08658017963171005, "rewards/rejected": -0.06408067792654037, "step": 21 }, { "debug/policy_chosen_logits": -1.477999210357666, "debug/policy_chosen_logps": -236.61976623535156, "debug/policy_rejected_logits": -1.5330206155776978, "debug/policy_rejected_logps": -254.75137329101562, "debug/reference_chosen_logps": -242.10296630859375, "debug/reference_rejected_logps": -256.4278564453125, "epoch": 0.5116279069767442, "grad_norm": 12.717740462220679, "learning_rate": 1e-06, "logits/chosen": -1.477999210357666, "logits/rejected": -1.5330206155776978, "logps/chosen": -236.61976623535156, "logps/rejected": -254.75137329101562, "loss": 0.4661, "rewards/accuracies": 0.625, "rewards/chosen": 0.054832056164741516, "rewards/margins": 0.038067229092121124, "rewards/rejected": 0.01676483266055584, "step": 22 }, { "debug/policy_chosen_logits": -1.5425748825073242, "debug/policy_chosen_logps": -253.24728393554688, "debug/policy_rejected_logits": -1.3970561027526855, "debug/policy_rejected_logps": -285.9176025390625, "debug/reference_chosen_logps": -256.5536804199219, "debug/reference_rejected_logps": -280.66290283203125, "epoch": 0.5348837209302325, "grad_norm": 14.114516755791191, "learning_rate": 1e-06, "logits/chosen": -1.5425748825073242, "logits/rejected": -1.3970561027526855, "logps/chosen": -253.24728393554688, "logps/rejected": -285.9176025390625, "loss": 0.4789, "rewards/accuracies": 0.875, "rewards/chosen": 0.03306387737393379, "rewards/margins": 0.08561088144779205, "rewards/rejected": -0.052546996623277664, "step": 23 }, { "debug/policy_chosen_logits": -1.4313610792160034, "debug/policy_chosen_logps": -257.1189880371094, "debug/policy_rejected_logits": -1.4023176431655884, "debug/policy_rejected_logps": -287.728759765625, "debug/reference_chosen_logps": -253.95172119140625, "debug/reference_rejected_logps": -281.169189453125, "epoch": 0.5581395348837209, "grad_norm": 16.70530366237368, "learning_rate": 1e-06, "logits/chosen": -1.4313610792160034, "logits/rejected": -1.4023176431655884, "logps/chosen": -257.1189880371094, "logps/rejected": -287.728759765625, "loss": 0.4995, "rewards/accuracies": 0.5, "rewards/chosen": -0.03167259320616722, "rewards/margins": 0.033923033624887466, "rewards/rejected": -0.06559562683105469, "step": 24 }, { "debug/policy_chosen_logits": -1.6776950359344482, "debug/policy_chosen_logps": -212.17718505859375, "debug/policy_rejected_logits": -1.6066213846206665, "debug/policy_rejected_logps": -319.0657653808594, "debug/reference_chosen_logps": -210.25326538085938, "debug/reference_rejected_logps": -314.89080810546875, "epoch": 0.5813953488372093, "grad_norm": 19.63229356498524, "learning_rate": 1e-06, "logits/chosen": -1.6776950359344482, "logits/rejected": -1.6066213846206665, "logps/chosen": -212.17718505859375, "logps/rejected": -319.0657653808594, "loss": 0.4833, "rewards/accuracies": 0.5, "rewards/chosen": -0.019239062443375587, "rewards/margins": 0.022510146722197533, "rewards/rejected": -0.04174920916557312, "step": 25 }, { "debug/policy_chosen_logits": -1.584831714630127, "debug/policy_chosen_logps": -273.2344055175781, "debug/policy_rejected_logits": -1.4396116733551025, "debug/policy_rejected_logps": -370.0079040527344, "debug/reference_chosen_logps": -266.8863220214844, "debug/reference_rejected_logps": -362.03753662109375, "epoch": 0.6046511627906976, "grad_norm": 80.99628039928015, "learning_rate": 1e-06, "logits/chosen": -1.584831714630127, "logits/rejected": -1.4396116733551025, "logps/chosen": -273.2344055175781, "logps/rejected": -370.0079040527344, "loss": 0.4921, "rewards/accuracies": 0.625, "rewards/chosen": -0.06348095089197159, "rewards/margins": 0.016222573816776276, "rewards/rejected": -0.07970351725816727, "step": 26 }, { "debug/policy_chosen_logits": -1.5102322101593018, "debug/policy_chosen_logps": -208.26466369628906, "debug/policy_rejected_logits": -1.5266185998916626, "debug/policy_rejected_logps": -328.9117736816406, "debug/reference_chosen_logps": -204.88662719726562, "debug/reference_rejected_logps": -320.92779541015625, "epoch": 0.627906976744186, "grad_norm": 31.627328387030346, "learning_rate": 1e-06, "logits/chosen": -1.5102322101593018, "logits/rejected": -1.5266185998916626, "logps/chosen": -208.26466369628906, "logps/rejected": -328.9117736816406, "loss": 0.48, "rewards/accuracies": 0.875, "rewards/chosen": -0.03378046676516533, "rewards/margins": 0.04605957865715027, "rewards/rejected": -0.0798400491476059, "step": 27 }, { "debug/policy_chosen_logits": -1.634057879447937, "debug/policy_chosen_logps": -241.25396728515625, "debug/policy_rejected_logits": -1.5432931184768677, "debug/policy_rejected_logps": -305.04144287109375, "debug/reference_chosen_logps": -237.45614624023438, "debug/reference_rejected_logps": -299.62860107421875, "epoch": 0.6511627906976745, "grad_norm": 20.965225897782357, "learning_rate": 1e-06, "logits/chosen": -1.634057879447937, "logits/rejected": -1.5432931184768677, "logps/chosen": -241.25396728515625, "logps/rejected": -305.04144287109375, "loss": 0.4787, "rewards/accuracies": 0.375, "rewards/chosen": -0.03797803819179535, "rewards/margins": 0.016150377690792084, "rewards/rejected": -0.05412841960787773, "step": 28 }, { "debug/policy_chosen_logits": -1.4655396938323975, "debug/policy_chosen_logps": -244.4412078857422, "debug/policy_rejected_logits": -1.3155853748321533, "debug/policy_rejected_logps": -346.4338073730469, "debug/reference_chosen_logps": -242.06707763671875, "debug/reference_rejected_logps": -339.31298828125, "epoch": 0.6744186046511628, "grad_norm": 41.53419555606317, "learning_rate": 1e-06, "logits/chosen": -1.4655396938323975, "logits/rejected": -1.3155853748321533, "logps/chosen": -244.4412078857422, "logps/rejected": -346.4338073730469, "loss": 0.4678, "rewards/accuracies": 0.5, "rewards/chosen": -0.023741263896226883, "rewards/margins": 0.04746692627668381, "rewards/rejected": -0.07120818644762039, "step": 29 }, { "debug/policy_chosen_logits": -1.376099705696106, "debug/policy_chosen_logps": -284.8990478515625, "debug/policy_rejected_logits": -1.464508295059204, "debug/policy_rejected_logps": -332.29876708984375, "debug/reference_chosen_logps": -279.30706787109375, "debug/reference_rejected_logps": -325.35540771484375, "epoch": 0.6976744186046512, "grad_norm": 28.2708663571601, "learning_rate": 1e-06, "logits/chosen": -1.376099705696106, "logits/rejected": -1.464508295059204, "logps/chosen": -284.8990478515625, "logps/rejected": -332.29876708984375, "loss": 0.4951, "rewards/accuracies": 0.75, "rewards/chosen": -0.05591966211795807, "rewards/margins": 0.013513736426830292, "rewards/rejected": -0.06943340599536896, "step": 30 }, { "debug/policy_chosen_logits": -1.6072348356246948, "debug/policy_chosen_logps": -303.22357177734375, "debug/policy_rejected_logits": -1.5747175216674805, "debug/policy_rejected_logps": -293.53021240234375, "debug/reference_chosen_logps": -308.82586669921875, "debug/reference_rejected_logps": -294.2177429199219, "epoch": 0.7209302325581395, "grad_norm": 21.3674681524243, "learning_rate": 1e-06, "logits/chosen": -1.6072348356246948, "logits/rejected": -1.5747175216674805, "logps/chosen": -303.22357177734375, "logps/rejected": -293.53021240234375, "loss": 0.4852, "rewards/accuracies": 0.5, "rewards/chosen": 0.056022755801677704, "rewards/margins": 0.04914722591638565, "rewards/rejected": 0.006875534541904926, "step": 31 }, { "debug/policy_chosen_logits": -1.6013528108596802, "debug/policy_chosen_logps": -201.08168029785156, "debug/policy_rejected_logits": -1.525626301765442, "debug/policy_rejected_logps": -285.8968505859375, "debug/reference_chosen_logps": -204.1475067138672, "debug/reference_rejected_logps": -287.4036865234375, "epoch": 0.7441860465116279, "grad_norm": 69.64848167898147, "learning_rate": 1e-06, "logits/chosen": -1.6013528108596802, "logits/rejected": -1.525626301765442, "logps/chosen": -201.08168029785156, "logps/rejected": -285.8968505859375, "loss": 0.4891, "rewards/accuracies": 0.625, "rewards/chosen": 0.030658282339572906, "rewards/margins": 0.01559007540345192, "rewards/rejected": 0.015068206936120987, "step": 32 }, { "debug/policy_chosen_logits": -1.451366662979126, "debug/policy_chosen_logps": -203.4061279296875, "debug/policy_rejected_logits": -1.3678964376449585, "debug/policy_rejected_logps": -318.1324462890625, "debug/reference_chosen_logps": -208.88272094726562, "debug/reference_rejected_logps": -317.94720458984375, "epoch": 0.7674418604651163, "grad_norm": 35.97418955885682, "learning_rate": 1e-06, "logits/chosen": -1.451366662979126, "logits/rejected": -1.3678964376449585, "logps/chosen": -203.4061279296875, "logps/rejected": -318.1324462890625, "loss": 0.472, "rewards/accuracies": 0.875, "rewards/chosen": 0.05476587265729904, "rewards/margins": 0.05661845579743385, "rewards/rejected": -0.0018525868654251099, "step": 33 }, { "debug/policy_chosen_logits": -1.4909645318984985, "debug/policy_chosen_logps": -225.0683135986328, "debug/policy_rejected_logits": -1.3721257448196411, "debug/policy_rejected_logps": -336.38055419921875, "debug/reference_chosen_logps": -228.97702026367188, "debug/reference_rejected_logps": -332.643798828125, "epoch": 0.7906976744186046, "grad_norm": 34.03019022306166, "learning_rate": 1e-06, "logits/chosen": -1.4909645318984985, "logits/rejected": -1.3721257448196411, "logps/chosen": -225.0683135986328, "logps/rejected": -336.38055419921875, "loss": 0.4836, "rewards/accuracies": 0.625, "rewards/chosen": 0.039086971431970596, "rewards/margins": 0.07645416259765625, "rewards/rejected": -0.037367187440395355, "step": 34 }, { "debug/policy_chosen_logits": -1.4662220478057861, "debug/policy_chosen_logps": -208.84197998046875, "debug/policy_rejected_logits": -1.3850266933441162, "debug/policy_rejected_logps": -313.42071533203125, "debug/reference_chosen_logps": -218.6650390625, "debug/reference_rejected_logps": -315.7051696777344, "epoch": 0.813953488372093, "grad_norm": 14.433858141028516, "learning_rate": 1e-06, "logits/chosen": -1.4662220478057861, "logits/rejected": -1.3850266933441162, "logps/chosen": -208.84197998046875, "logps/rejected": -313.42071533203125, "loss": 0.4699, "rewards/accuracies": 0.75, "rewards/chosen": 0.09823057055473328, "rewards/margins": 0.07538595795631409, "rewards/rejected": 0.022844618186354637, "step": 35 }, { "debug/policy_chosen_logits": -1.4087128639221191, "debug/policy_chosen_logps": -239.64378356933594, "debug/policy_rejected_logits": -1.4237617254257202, "debug/policy_rejected_logps": -290.43841552734375, "debug/reference_chosen_logps": -242.57217407226562, "debug/reference_rejected_logps": -287.6371154785156, "epoch": 0.8372093023255814, "grad_norm": 25.211278136613597, "learning_rate": 1e-06, "logits/chosen": -1.4087128639221191, "logits/rejected": -1.4237617254257202, "logps/chosen": -239.64378356933594, "logps/rejected": -290.43841552734375, "loss": 0.4783, "rewards/accuracies": 0.625, "rewards/chosen": 0.029283981770277023, "rewards/margins": 0.057296961545944214, "rewards/rejected": -0.02801297791302204, "step": 36 }, { "debug/policy_chosen_logits": -1.4810130596160889, "debug/policy_chosen_logps": -323.06396484375, "debug/policy_rejected_logits": -1.384968876838684, "debug/policy_rejected_logps": -310.9790344238281, "debug/reference_chosen_logps": -320.3404846191406, "debug/reference_rejected_logps": -307.171142578125, "epoch": 0.8604651162790697, "grad_norm": 14.965706611341272, "learning_rate": 1e-06, "logits/chosen": -1.4810130596160889, "logits/rejected": -1.384968876838684, "logps/chosen": -323.06396484375, "logps/rejected": -310.9790344238281, "loss": 0.4828, "rewards/accuracies": 0.5, "rewards/chosen": -0.027234820649027824, "rewards/margins": 0.010844306088984013, "rewards/rejected": -0.03807912766933441, "step": 37 }, { "debug/policy_chosen_logits": -1.494775652885437, "debug/policy_chosen_logps": -255.37355041503906, "debug/policy_rejected_logits": -1.2921141386032104, "debug/policy_rejected_logps": -303.7215270996094, "debug/reference_chosen_logps": -254.65061950683594, "debug/reference_rejected_logps": -298.89752197265625, "epoch": 0.8837209302325582, "grad_norm": 18.608451616837808, "learning_rate": 1e-06, "logits/chosen": -1.494775652885437, "logits/rejected": -1.2921141386032104, "logps/chosen": -255.37355041503906, "logps/rejected": -303.7215270996094, "loss": 0.4731, "rewards/accuracies": 0.625, "rewards/chosen": -0.007229272276163101, "rewards/margins": 0.041010741144418716, "rewards/rejected": -0.04824001342058182, "step": 38 }, { "debug/policy_chosen_logits": -1.5846779346466064, "debug/policy_chosen_logps": -245.59695434570312, "debug/policy_rejected_logits": -1.4889392852783203, "debug/policy_rejected_logps": -339.7038269042969, "debug/reference_chosen_logps": -243.83364868164062, "debug/reference_rejected_logps": -329.29339599609375, "epoch": 0.9069767441860465, "grad_norm": 19.463522713048608, "learning_rate": 1e-06, "logits/chosen": -1.5846779346466064, "logits/rejected": -1.4889392852783203, "logps/chosen": -245.59695434570312, "logps/rejected": -339.7038269042969, "loss": 0.4796, "rewards/accuracies": 0.875, "rewards/chosen": -0.01763315126299858, "rewards/margins": 0.08647122979164124, "rewards/rejected": -0.10410438477993011, "step": 39 }, { "debug/policy_chosen_logits": -1.6717126369476318, "debug/policy_chosen_logps": -252.74761962890625, "debug/policy_rejected_logits": -1.5717849731445312, "debug/policy_rejected_logps": -283.14227294921875, "debug/reference_chosen_logps": -250.6838836669922, "debug/reference_rejected_logps": -272.893310546875, "epoch": 0.9302325581395349, "grad_norm": 56.526734840161275, "learning_rate": 1e-06, "logits/chosen": -1.6717126369476318, "logits/rejected": -1.5717849731445312, "logps/chosen": -252.74761962890625, "logps/rejected": -283.14227294921875, "loss": 0.4867, "rewards/accuracies": 0.875, "rewards/chosen": -0.020637379959225655, "rewards/margins": 0.08185231685638428, "rewards/rejected": -0.10248969495296478, "step": 40 }, { "debug/policy_chosen_logits": -1.509047269821167, "debug/policy_chosen_logps": -236.76260375976562, "debug/policy_rejected_logits": -1.4229860305786133, "debug/policy_rejected_logps": -297.8006286621094, "debug/reference_chosen_logps": -231.5150604248047, "debug/reference_rejected_logps": -290.0295104980469, "epoch": 0.9534883720930233, "grad_norm": 48.26429130431874, "learning_rate": 1e-06, "logits/chosen": -1.509047269821167, "logits/rejected": -1.4229860305786133, "logps/chosen": -236.76260375976562, "logps/rejected": -297.8006286621094, "loss": 0.479, "rewards/accuracies": 0.5, "rewards/chosen": -0.052475374191999435, "rewards/margins": 0.025235844776034355, "rewards/rejected": -0.07771121710538864, "step": 41 }, { "debug/policy_chosen_logits": -1.4423142671585083, "debug/policy_chosen_logps": -221.72979736328125, "debug/policy_rejected_logits": -1.388156771659851, "debug/policy_rejected_logps": -291.08966064453125, "debug/reference_chosen_logps": -220.1903076171875, "debug/reference_rejected_logps": -282.77642822265625, "epoch": 0.9767441860465116, "grad_norm": 51.290551175577924, "learning_rate": 1e-06, "logits/chosen": -1.4423142671585083, "logits/rejected": -1.388156771659851, "logps/chosen": -221.72979736328125, "logps/rejected": -291.08966064453125, "loss": 0.4717, "rewards/accuracies": 0.75, "rewards/chosen": -0.015394706279039383, "rewards/margins": 0.06773784011602402, "rewards/rejected": -0.0831325501203537, "step": 42 }, { "debug/policy_chosen_logits": -1.5267871618270874, "debug/policy_chosen_logps": -317.9560241699219, "debug/policy_rejected_logits": -1.5894036293029785, "debug/policy_rejected_logps": -294.4762878417969, "debug/reference_chosen_logps": -310.841552734375, "debug/reference_rejected_logps": -288.2388916015625, "epoch": 1.0, "grad_norm": 17.024727279476426, "learning_rate": 1e-06, "logits/chosen": -1.5267871618270874, "logits/rejected": -1.5894036293029785, "logps/chosen": -317.9560241699219, "logps/rejected": -294.4762878417969, "loss": 0.4655, "rewards/accuracies": 0.625, "rewards/chosen": -0.07114488631486893, "rewards/margins": -0.008771037682890892, "rewards/rejected": -0.06237384304404259, "step": 43 }, { "epoch": 1.0, "step": 43, "total_flos": 0.0, "train_loss": 0.4878519225952237, "train_runtime": 150.4846, "train_samples_per_second": 18.095, "train_steps_per_second": 0.286 } ], "logging_steps": 1, "max_steps": 43, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }