{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 48, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -1.8909083604812622, "debug/policy_chosen_logps": -212.1804656982422, "debug/policy_rejected_logits": -1.8466837406158447, "debug/policy_rejected_logps": -221.20199584960938, "debug/reference_chosen_logps": -212.1804656982422, "debug/reference_rejected_logps": -221.20199584960938, "epoch": 0.020833333333333332, "grad_norm": 4.55949998556895, "learning_rate": 1e-06, "logits/chosen": -1.8909083604812622, "logits/rejected": -1.8466837406158447, "logps/chosen": -212.1804656982422, "logps/rejected": -221.20199584960938, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -1.9734070301055908, "debug/policy_chosen_logps": -230.3494873046875, "debug/policy_rejected_logits": -1.894794225692749, "debug/policy_rejected_logps": -232.613037109375, "debug/reference_chosen_logps": -230.78518676757812, "debug/reference_rejected_logps": -232.9464111328125, "epoch": 0.041666666666666664, "grad_norm": 5.028939569239188, "learning_rate": 1e-06, "logits/chosen": -1.9734070301055908, "logits/rejected": -1.894794225692749, "logps/chosen": -230.3494873046875, "logps/rejected": -232.613037109375, "loss": 0.5001, "rewards/accuracies": 0.5, "rewards/chosen": 0.004357052035629749, "rewards/margins": 0.0010233304928988218, "rewards/rejected": 0.00333372107706964, "step": 2 }, { "debug/policy_chosen_logits": -1.796450138092041, "debug/policy_chosen_logps": -239.1052703857422, "debug/policy_rejected_logits": -1.9075357913970947, "debug/policy_rejected_logps": -226.32025146484375, "debug/reference_chosen_logps": -238.9079132080078, "debug/reference_rejected_logps": -225.81983947753906, "epoch": 0.0625, "grad_norm": 5.159311600496707, "learning_rate": 1e-06, "logits/chosen": -1.796450138092041, "logits/rejected": -1.9075357913970947, "logps/chosen": -239.1052703857422, "logps/rejected": -226.32025146484375, "loss": 0.4992, "rewards/accuracies": 0.625, "rewards/chosen": -0.0019736099056899548, "rewards/margins": 0.0030304910615086555, "rewards/rejected": -0.005004100501537323, "step": 3 }, { "debug/policy_chosen_logits": -1.8530775308609009, "debug/policy_chosen_logps": -224.97164916992188, "debug/policy_rejected_logits": -1.868254542350769, "debug/policy_rejected_logps": -193.08578491210938, "debug/reference_chosen_logps": -225.149658203125, "debug/reference_rejected_logps": -193.382568359375, "epoch": 0.08333333333333333, "grad_norm": 5.291981074658711, "learning_rate": 1e-06, "logits/chosen": -1.8530775308609009, "logits/rejected": -1.868254542350769, "logps/chosen": -224.97164916992188, "logps/rejected": -193.08578491210938, "loss": 0.5002, "rewards/accuracies": 0.375, "rewards/chosen": 0.0017800141358748078, "rewards/margins": -0.0011878202203661203, "rewards/rejected": 0.00296783447265625, "step": 4 }, { "debug/policy_chosen_logits": -2.037583827972412, "debug/policy_chosen_logps": -218.92698669433594, "debug/policy_rejected_logits": -1.9175313711166382, "debug/policy_rejected_logps": -226.9515838623047, "debug/reference_chosen_logps": -218.6461181640625, "debug/reference_rejected_logps": -226.9630126953125, "epoch": 0.10416666666666667, "grad_norm": 4.634912199648336, "learning_rate": 1e-06, "logits/chosen": -2.037583827972412, "logits/rejected": -1.9175313711166382, "logps/chosen": -218.92698669433594, "logps/rejected": -226.9515838623047, "loss": 0.5003, "rewards/accuracies": 0.375, "rewards/chosen": -0.0028084944933652878, "rewards/margins": -0.002922859275713563, "rewards/rejected": 0.00011436472414061427, "step": 5 }, { "debug/policy_chosen_logits": -1.9980738162994385, "debug/policy_chosen_logps": -229.96798706054688, "debug/policy_rejected_logits": -2.106717586517334, "debug/policy_rejected_logps": -228.21139526367188, "debug/reference_chosen_logps": -229.47543334960938, "debug/reference_rejected_logps": -227.33944702148438, "epoch": 0.125, "grad_norm": 4.883441388138732, "learning_rate": 1e-06, "logits/chosen": -1.9980738162994385, "logits/rejected": -2.106717586517334, "logps/chosen": -229.96798706054688, "logps/rejected": -228.21139526367188, "loss": 0.4985, "rewards/accuracies": 0.625, "rewards/chosen": -0.004925765562802553, "rewards/margins": 0.0037936782464385033, "rewards/rejected": -0.008719444274902344, "step": 6 }, { "debug/policy_chosen_logits": -1.831208348274231, "debug/policy_chosen_logps": -209.08120727539062, "debug/policy_rejected_logits": -1.866153597831726, "debug/policy_rejected_logps": -212.32781982421875, "debug/reference_chosen_logps": -209.07412719726562, "debug/reference_rejected_logps": -212.41383361816406, "epoch": 0.14583333333333334, "grad_norm": 4.5645610444456395, "learning_rate": 1e-06, "logits/chosen": -1.831208348274231, "logits/rejected": -1.866153597831726, "logps/chosen": -209.08120727539062, "logps/rejected": -212.32781982421875, "loss": 0.4994, "rewards/accuracies": 0.25, "rewards/chosen": -7.074361201375723e-05, "rewards/margins": -0.0009309577289968729, "rewards/rejected": 0.0008602142333984375, "step": 7 }, { "debug/policy_chosen_logits": -2.0019192695617676, "debug/policy_chosen_logps": -194.42999267578125, "debug/policy_rejected_logits": -1.971666932106018, "debug/policy_rejected_logps": -202.69728088378906, "debug/reference_chosen_logps": -194.51901245117188, "debug/reference_rejected_logps": -202.4325714111328, "epoch": 0.16666666666666666, "grad_norm": 4.5561230493648965, "learning_rate": 1e-06, "logits/chosen": -2.0019192695617676, "logits/rejected": -1.971666932106018, "logps/chosen": -194.42999267578125, "logps/rejected": -202.69728088378906, "loss": 0.4992, "rewards/accuracies": 0.75, "rewards/chosen": 0.0008901978144422174, "rewards/margins": 0.0035373116843402386, "rewards/rejected": -0.0026471137534826994, "step": 8 }, { "debug/policy_chosen_logits": -1.9558801651000977, "debug/policy_chosen_logps": -215.09481811523438, "debug/policy_rejected_logits": -1.905698537826538, "debug/policy_rejected_logps": -221.3304443359375, "debug/reference_chosen_logps": -215.267578125, "debug/reference_rejected_logps": -221.17218017578125, "epoch": 0.1875, "grad_norm": 4.561516825339906, "learning_rate": 1e-06, "logits/chosen": -1.9558801651000977, "logits/rejected": -1.905698537826538, "logps/chosen": -215.09481811523438, "logps/rejected": -221.3304443359375, "loss": 0.4993, "rewards/accuracies": 0.375, "rewards/chosen": 0.0017275046557188034, "rewards/margins": 0.0033102035522460938, "rewards/rejected": -0.0015826987801119685, "step": 9 }, { "debug/policy_chosen_logits": -2.034991979598999, "debug/policy_chosen_logps": -203.59085083007812, "debug/policy_rejected_logits": -2.0764079093933105, "debug/policy_rejected_logps": -204.25738525390625, "debug/reference_chosen_logps": -204.2655029296875, "debug/reference_rejected_logps": -204.46153259277344, "epoch": 0.20833333333333334, "grad_norm": 4.745773144075817, "learning_rate": 1e-06, "logits/chosen": -2.034991979598999, "logits/rejected": -2.0764079093933105, "logps/chosen": -203.59085083007812, "logps/rejected": -204.25738525390625, "loss": 0.4984, "rewards/accuracies": 0.75, "rewards/chosen": 0.006746310740709305, "rewards/margins": 0.004704780410975218, "rewards/rejected": 0.0020415307953953743, "step": 10 }, { "debug/policy_chosen_logits": -2.0038836002349854, "debug/policy_chosen_logps": -226.81753540039062, "debug/policy_rejected_logits": -1.8038012981414795, "debug/policy_rejected_logps": -226.14935302734375, "debug/reference_chosen_logps": -226.28543090820312, "debug/reference_rejected_logps": -225.2156524658203, "epoch": 0.22916666666666666, "grad_norm": 4.763690551977785, "learning_rate": 1e-06, "logits/chosen": -2.0038836002349854, "logits/rejected": -1.8038012981414795, "logps/chosen": -226.81753540039062, "logps/rejected": -226.14935302734375, "loss": 0.4984, "rewards/accuracies": 0.5, "rewards/chosen": -0.005321083124727011, "rewards/margins": 0.004015827551484108, "rewards/rejected": -0.009336910210549831, "step": 11 }, { "debug/policy_chosen_logits": -2.122854471206665, "debug/policy_chosen_logps": -204.9606170654297, "debug/policy_rejected_logits": -1.9377713203430176, "debug/policy_rejected_logps": -229.38241577148438, "debug/reference_chosen_logps": -204.8526153564453, "debug/reference_rejected_logps": -229.0899658203125, "epoch": 0.25, "grad_norm": 4.606159450019782, "learning_rate": 1e-06, "logits/chosen": -2.122854471206665, "logits/rejected": -1.9377713203430176, "logps/chosen": -204.9606170654297, "logps/rejected": -229.38241577148438, "loss": 0.5, "rewards/accuracies": 0.625, "rewards/chosen": -0.0010799788869917393, "rewards/margins": 0.0018445015884935856, "rewards/rejected": -0.002924480475485325, "step": 12 }, { "debug/policy_chosen_logits": -1.9475512504577637, "debug/policy_chosen_logps": -224.90866088867188, "debug/policy_rejected_logits": -1.981605887413025, "debug/policy_rejected_logps": -228.95736694335938, "debug/reference_chosen_logps": -225.68130493164062, "debug/reference_rejected_logps": -229.091796875, "epoch": 0.2708333333333333, "grad_norm": 5.058425556791009, "learning_rate": 1e-06, "logits/chosen": -1.9475512504577637, "logits/rejected": -1.981605887413025, "logps/chosen": -224.90866088867188, "logps/rejected": -228.95736694335938, "loss": 0.4975, "rewards/accuracies": 0.625, "rewards/chosen": 0.007726612035185099, "rewards/margins": 0.00638235080987215, "rewards/rejected": 0.0013442612253129482, "step": 13 }, { "debug/policy_chosen_logits": -1.999975323677063, "debug/policy_chosen_logps": -209.15884399414062, "debug/policy_rejected_logits": -1.9890738725662231, "debug/policy_rejected_logps": -207.29666137695312, "debug/reference_chosen_logps": -209.07809448242188, "debug/reference_rejected_logps": -207.4446563720703, "epoch": 0.2916666666666667, "grad_norm": 4.7138136584768215, "learning_rate": 1e-06, "logits/chosen": -1.999975323677063, "logits/rejected": -1.9890738725662231, "logps/chosen": -209.15884399414062, "logps/rejected": -207.29666137695312, "loss": 0.4986, "rewards/accuracies": 0.25, "rewards/chosen": -0.0008074190700426698, "rewards/margins": -0.002287311712279916, "rewards/rejected": 0.001479892642237246, "step": 14 }, { "debug/policy_chosen_logits": -1.8115614652633667, "debug/policy_chosen_logps": -241.42343139648438, "debug/policy_rejected_logits": -1.799264669418335, "debug/policy_rejected_logps": -215.36915588378906, "debug/reference_chosen_logps": -242.1216583251953, "debug/reference_rejected_logps": -215.05712890625, "epoch": 0.3125, "grad_norm": 4.724225993954909, "learning_rate": 1e-06, "logits/chosen": -1.8115614652633667, "logits/rejected": -1.799264669418335, "logps/chosen": -241.42343139648438, "logps/rejected": -215.36915588378906, "loss": 0.4959, "rewards/accuracies": 0.625, "rewards/chosen": 0.006982230581343174, "rewards/margins": 0.010102405212819576, "rewards/rejected": -0.0031201746314764023, "step": 15 }, { "debug/policy_chosen_logits": -1.9611539840698242, "debug/policy_chosen_logps": -240.25247192382812, "debug/policy_rejected_logits": -1.8923048973083496, "debug/policy_rejected_logps": -247.1781768798828, "debug/reference_chosen_logps": -241.02410888671875, "debug/reference_rejected_logps": -248.17608642578125, "epoch": 0.3333333333333333, "grad_norm": 4.859194726511771, "learning_rate": 1e-06, "logits/chosen": -1.9611539840698242, "logits/rejected": -1.8923048973083496, "logps/chosen": -240.25247192382812, "logps/rejected": -247.1781768798828, "loss": 0.5015, "rewards/accuracies": 0.375, "rewards/chosen": 0.0077165220864117146, "rewards/margins": -0.002262687310576439, "rewards/rejected": 0.00997920986264944, "step": 16 }, { "debug/policy_chosen_logits": -1.8419935703277588, "debug/policy_chosen_logps": -225.85903930664062, "debug/policy_rejected_logits": -1.8397587537765503, "debug/policy_rejected_logps": -222.993896484375, "debug/reference_chosen_logps": -225.80227661132812, "debug/reference_rejected_logps": -223.05838012695312, "epoch": 0.3541666666666667, "grad_norm": 5.459841357543238, "learning_rate": 1e-06, "logits/chosen": -1.8419935703277588, "logits/rejected": -1.8397587537765503, "logps/chosen": -225.85903930664062, "logps/rejected": -222.993896484375, "loss": 0.4973, "rewards/accuracies": 0.25, "rewards/chosen": -0.0005677794106304646, "rewards/margins": -0.001212749513797462, "rewards/rejected": 0.000644969753921032, "step": 17 }, { "debug/policy_chosen_logits": -2.0776429176330566, "debug/policy_chosen_logps": -198.11862182617188, "debug/policy_rejected_logits": -1.911252498626709, "debug/policy_rejected_logps": -210.159912109375, "debug/reference_chosen_logps": -198.45901489257812, "debug/reference_rejected_logps": -209.61679077148438, "epoch": 0.375, "grad_norm": 5.052192403611287, "learning_rate": 1e-06, "logits/chosen": -2.0776429176330566, "logits/rejected": -1.911252498626709, "logps/chosen": -198.11862182617188, "logps/rejected": -210.159912109375, "loss": 0.4972, "rewards/accuracies": 0.875, "rewards/chosen": 0.003403835231438279, "rewards/margins": 0.008835029788315296, "rewards/rejected": -0.005431194789707661, "step": 18 }, { "debug/policy_chosen_logits": -1.9367341995239258, "debug/policy_chosen_logps": -199.481689453125, "debug/policy_rejected_logits": -2.1376593112945557, "debug/policy_rejected_logps": -220.9750213623047, "debug/reference_chosen_logps": -198.0703582763672, "debug/reference_rejected_logps": -220.8565673828125, "epoch": 0.3958333333333333, "grad_norm": 4.498097056137636, "learning_rate": 1e-06, "logits/chosen": -1.9367341995239258, "logits/rejected": -2.1376593112945557, "logps/chosen": -199.481689453125, "logps/rejected": -220.9750213623047, "loss": 0.5032, "rewards/accuracies": 0.25, "rewards/chosen": -0.014113139361143112, "rewards/margins": -0.012928619049489498, "rewards/rejected": -0.0011845207773149014, "step": 19 }, { "debug/policy_chosen_logits": -1.8367891311645508, "debug/policy_chosen_logps": -212.05157470703125, "debug/policy_rejected_logits": -1.9387035369873047, "debug/policy_rejected_logps": -215.82733154296875, "debug/reference_chosen_logps": -210.82493591308594, "debug/reference_rejected_logps": -215.45745849609375, "epoch": 0.4166666666666667, "grad_norm": 5.189713253568091, "learning_rate": 1e-06, "logits/chosen": -1.8367891311645508, "logits/rejected": -1.9387035369873047, "logps/chosen": -212.05157470703125, "logps/rejected": -215.82733154296875, "loss": 0.4931, "rewards/accuracies": 0.25, "rewards/chosen": -0.012266368605196476, "rewards/margins": -0.008567637763917446, "rewards/rejected": -0.0036987303756177425, "step": 20 }, { "debug/policy_chosen_logits": -1.8324953317642212, "debug/policy_chosen_logps": -218.19522094726562, "debug/policy_rejected_logits": -1.8325927257537842, "debug/policy_rejected_logps": -196.0030975341797, "debug/reference_chosen_logps": -218.56109619140625, "debug/reference_rejected_logps": -196.50845336914062, "epoch": 0.4375, "grad_norm": 4.589785119981397, "learning_rate": 1e-06, "logits/chosen": -1.8324953317642212, "logits/rejected": -1.8325927257537842, "logps/chosen": -218.19522094726562, "logps/rejected": -196.0030975341797, "loss": 0.4983, "rewards/accuracies": 0.375, "rewards/chosen": 0.0036588667426258326, "rewards/margins": -0.0013946916442364454, "rewards/rejected": 0.005053558386862278, "step": 21 }, { "debug/policy_chosen_logits": -1.8342565298080444, "debug/policy_chosen_logps": -204.41470336914062, "debug/policy_rejected_logits": -1.7877732515335083, "debug/policy_rejected_logps": -209.70840454101562, "debug/reference_chosen_logps": -204.43324279785156, "debug/reference_rejected_logps": -209.82937622070312, "epoch": 0.4583333333333333, "grad_norm": 5.153849908641546, "learning_rate": 1e-06, "logits/chosen": -1.8342565298080444, "logits/rejected": -1.7877732515335083, "logps/chosen": -204.41470336914062, "logps/rejected": -209.70840454101562, "loss": 0.4949, "rewards/accuracies": 0.625, "rewards/chosen": 0.00018556579016149044, "rewards/margins": -0.001024017110466957, "rewards/rejected": 0.0012095831334590912, "step": 22 }, { "debug/policy_chosen_logits": -1.8171206712722778, "debug/policy_chosen_logps": -236.4231719970703, "debug/policy_rejected_logits": -1.7470709085464478, "debug/policy_rejected_logps": -222.30374145507812, "debug/reference_chosen_logps": -236.17730712890625, "debug/reference_rejected_logps": -221.56448364257812, "epoch": 0.4791666666666667, "grad_norm": 4.893815400071578, "learning_rate": 1e-06, "logits/chosen": -1.8171206712722778, "logits/rejected": -1.7470709085464478, "logps/chosen": -236.4231719970703, "logps/rejected": -222.30374145507812, "loss": 0.4916, "rewards/accuracies": 0.5, "rewards/chosen": -0.00245870603248477, "rewards/margins": 0.004933852702379227, "rewards/rejected": -0.007392558269202709, "step": 23 }, { "debug/policy_chosen_logits": -1.6100187301635742, "debug/policy_chosen_logps": -212.08888244628906, "debug/policy_rejected_logits": -1.6458077430725098, "debug/policy_rejected_logps": -224.05459594726562, "debug/reference_chosen_logps": -211.93698120117188, "debug/reference_rejected_logps": -224.69906616210938, "epoch": 0.5, "grad_norm": 4.528676055698226, "learning_rate": 1e-06, "logits/chosen": -1.6100187301635742, "logits/rejected": -1.6458077430725098, "logps/chosen": -212.08888244628906, "logps/rejected": -224.05459594726562, "loss": 0.4958, "rewards/accuracies": 0.375, "rewards/chosen": -0.0015190127305686474, "rewards/margins": -0.007963847368955612, "rewards/rejected": 0.006444835104048252, "step": 24 }, { "debug/policy_chosen_logits": -1.8963773250579834, "debug/policy_chosen_logps": -210.84144592285156, "debug/policy_rejected_logits": -1.88288414478302, "debug/policy_rejected_logps": -208.75941467285156, "debug/reference_chosen_logps": -210.99363708496094, "debug/reference_rejected_logps": -208.8118896484375, "epoch": 0.5208333333333334, "grad_norm": 4.253747413467878, "learning_rate": 1e-06, "logits/chosen": -1.8963773250579834, "logits/rejected": -1.88288414478302, "logps/chosen": -210.84144592285156, "logps/rejected": -208.75941467285156, "loss": 0.4986, "rewards/accuracies": 0.625, "rewards/chosen": 0.0015218928456306458, "rewards/margins": 0.0009972000261768699, "rewards/rejected": 0.0005246927030384541, "step": 25 }, { "debug/policy_chosen_logits": -1.7630257606506348, "debug/policy_chosen_logps": -226.61837768554688, "debug/policy_rejected_logits": -1.7200900316238403, "debug/policy_rejected_logps": -244.78564453125, "debug/reference_chosen_logps": -226.70632934570312, "debug/reference_rejected_logps": -244.5872802734375, "epoch": 0.5416666666666666, "grad_norm": 4.3073478263433875, "learning_rate": 1e-06, "logits/chosen": -1.7630257606506348, "logits/rejected": -1.7200900316238403, "logps/chosen": -226.61837768554688, "logps/rejected": -244.78564453125, "loss": 0.4977, "rewards/accuracies": 0.5, "rewards/chosen": 0.0008793829474598169, "rewards/margins": 0.002862987108528614, "rewards/rejected": -0.001983604161068797, "step": 26 }, { "debug/policy_chosen_logits": -1.8077607154846191, "debug/policy_chosen_logps": -211.0703887939453, "debug/policy_rejected_logits": -1.873244047164917, "debug/policy_rejected_logps": -235.58494567871094, "debug/reference_chosen_logps": -212.21527099609375, "debug/reference_rejected_logps": -235.125732421875, "epoch": 0.5625, "grad_norm": 4.735343462350552, "learning_rate": 1e-06, "logits/chosen": -1.8077607154846191, "logits/rejected": -1.873244047164917, "logps/chosen": -211.0703887939453, "logps/rejected": -235.58494567871094, "loss": 0.4979, "rewards/accuracies": 0.75, "rewards/chosen": 0.011448878794908524, "rewards/margins": 0.01604101061820984, "rewards/rejected": -0.00459213275462389, "step": 27 }, { "debug/policy_chosen_logits": -1.9979571104049683, "debug/policy_chosen_logps": -209.10702514648438, "debug/policy_rejected_logits": -1.9711703062057495, "debug/policy_rejected_logps": -210.96670532226562, "debug/reference_chosen_logps": -209.4556427001953, "debug/reference_rejected_logps": -210.02169799804688, "epoch": 0.5833333333333334, "grad_norm": 4.6588487462454395, "learning_rate": 1e-06, "logits/chosen": -1.9979571104049683, "logits/rejected": -1.9711703062057495, "logps/chosen": -209.10702514648438, "logps/rejected": -210.96670532226562, "loss": 0.4941, "rewards/accuracies": 0.625, "rewards/chosen": 0.003486290108412504, "rewards/margins": 0.012936287559568882, "rewards/rejected": -0.00944999698549509, "step": 28 }, { "debug/policy_chosen_logits": -1.8544397354125977, "debug/policy_chosen_logps": -221.3443145751953, "debug/policy_rejected_logits": -1.9452507495880127, "debug/policy_rejected_logps": -210.4943084716797, "debug/reference_chosen_logps": -221.66323852539062, "debug/reference_rejected_logps": -209.6286163330078, "epoch": 0.6041666666666666, "grad_norm": 4.357025587882443, "learning_rate": 1e-06, "logits/chosen": -1.8544397354125977, "logits/rejected": -1.9452507495880127, "logps/chosen": -221.3443145751953, "logps/rejected": -210.4943084716797, "loss": 0.4991, "rewards/accuracies": 0.625, "rewards/chosen": 0.0031892964616417885, "rewards/margins": 0.011846140958368778, "rewards/rejected": -0.008656845428049564, "step": 29 }, { "debug/policy_chosen_logits": -1.8338147401809692, "debug/policy_chosen_logps": -198.18841552734375, "debug/policy_rejected_logits": -1.7472094297409058, "debug/policy_rejected_logps": -218.80093383789062, "debug/reference_chosen_logps": -198.03680419921875, "debug/reference_rejected_logps": -219.40707397460938, "epoch": 0.625, "grad_norm": 4.842664247824887, "learning_rate": 1e-06, "logits/chosen": -1.8338147401809692, "logits/rejected": -1.7472094297409058, "logps/chosen": -198.18841552734375, "logps/rejected": -218.80093383789062, "loss": 0.4952, "rewards/accuracies": 0.25, "rewards/chosen": -0.0015161894261837006, "rewards/margins": -0.007577533833682537, "rewards/rejected": 0.0060613444074988365, "step": 30 }, { "debug/policy_chosen_logits": -1.8182415962219238, "debug/policy_chosen_logps": -206.81643676757812, "debug/policy_rejected_logits": -1.8333243131637573, "debug/policy_rejected_logps": -210.55618286132812, "debug/reference_chosen_logps": -205.9772186279297, "debug/reference_rejected_logps": -209.88775634765625, "epoch": 0.6458333333333334, "grad_norm": 4.918727552016112, "learning_rate": 1e-06, "logits/chosen": -1.8182415962219238, "logits/rejected": -1.8333243131637573, "logps/chosen": -206.81643676757812, "logps/rejected": -210.55618286132812, "loss": 0.4881, "rewards/accuracies": 0.625, "rewards/chosen": -0.008392143063247204, "rewards/margins": -0.0017078209202736616, "rewards/rejected": -0.0066843219101428986, "step": 31 }, { "debug/policy_chosen_logits": -1.8787778615951538, "debug/policy_chosen_logps": -206.3116455078125, "debug/policy_rejected_logits": -1.8249777555465698, "debug/policy_rejected_logps": -215.7191162109375, "debug/reference_chosen_logps": -206.0584259033203, "debug/reference_rejected_logps": -215.76406860351562, "epoch": 0.6666666666666666, "grad_norm": 4.531359648788432, "learning_rate": 1e-06, "logits/chosen": -1.8787778615951538, "logits/rejected": -1.8249777555465698, "logps/chosen": -206.3116455078125, "logps/rejected": -215.7191162109375, "loss": 0.4943, "rewards/accuracies": 0.5, "rewards/chosen": -0.002532253274694085, "rewards/margins": -0.0029817770700901747, "rewards/rejected": 0.0004495240282267332, "step": 32 }, { "debug/policy_chosen_logits": -1.9307353496551514, "debug/policy_chosen_logps": -210.0487823486328, "debug/policy_rejected_logits": -1.7820299863815308, "debug/policy_rejected_logps": -219.03224182128906, "debug/reference_chosen_logps": -209.83810424804688, "debug/reference_rejected_logps": -218.5866241455078, "epoch": 0.6875, "grad_norm": 4.504314802671299, "learning_rate": 1e-06, "logits/chosen": -1.9307353496551514, "logits/rejected": -1.7820299863815308, "logps/chosen": -210.0487823486328, "logps/rejected": -219.03224182128906, "loss": 0.4976, "rewards/accuracies": 0.5, "rewards/chosen": -0.002106723375618458, "rewards/margins": 0.0023493007756769657, "rewards/rejected": -0.004456023685634136, "step": 33 }, { "debug/policy_chosen_logits": -1.823900580406189, "debug/policy_chosen_logps": -199.96127319335938, "debug/policy_rejected_logits": -1.7855464220046997, "debug/policy_rejected_logps": -215.19229125976562, "debug/reference_chosen_logps": -199.58285522460938, "debug/reference_rejected_logps": -214.38479614257812, "epoch": 0.7083333333333334, "grad_norm": 4.457022242154747, "learning_rate": 1e-06, "logits/chosen": -1.823900580406189, "logits/rejected": -1.7855464220046997, "logps/chosen": -199.96127319335938, "logps/rejected": -215.19229125976562, "loss": 0.4971, "rewards/accuracies": 0.625, "rewards/chosen": -0.003784370142966509, "rewards/margins": 0.004290657117962837, "rewards/rejected": -0.008075027726590633, "step": 34 }, { "debug/policy_chosen_logits": -1.882658839225769, "debug/policy_chosen_logps": -209.23643493652344, "debug/policy_rejected_logits": -1.8747351169586182, "debug/policy_rejected_logps": -215.92868041992188, "debug/reference_chosen_logps": -210.23593139648438, "debug/reference_rejected_logps": -217.186279296875, "epoch": 0.7291666666666666, "grad_norm": 4.626965309640125, "learning_rate": 1e-06, "logits/chosen": -1.882658839225769, "logits/rejected": -1.8747351169586182, "logps/chosen": -209.23643493652344, "logps/rejected": -215.92868041992188, "loss": 0.4994, "rewards/accuracies": 0.5, "rewards/chosen": 0.009994887746870518, "rewards/margins": -0.0025810815859586, "rewards/rejected": 0.012575969099998474, "step": 35 }, { "debug/policy_chosen_logits": -1.8005609512329102, "debug/policy_chosen_logps": -225.9169158935547, "debug/policy_rejected_logits": -1.883096694946289, "debug/policy_rejected_logps": -234.09506225585938, "debug/reference_chosen_logps": -227.3267822265625, "debug/reference_rejected_logps": -234.53903198242188, "epoch": 0.75, "grad_norm": 4.3931691354690825, "learning_rate": 1e-06, "logits/chosen": -1.8005609512329102, "logits/rejected": -1.883096694946289, "logps/chosen": -225.9169158935547, "logps/rejected": -234.09506225585938, "loss": 0.491, "rewards/accuracies": 0.25, "rewards/chosen": 0.014098738320171833, "rewards/margins": 0.009659002535045147, "rewards/rejected": 0.004439735319465399, "step": 36 }, { "debug/policy_chosen_logits": -1.7069189548492432, "debug/policy_chosen_logps": -228.86813354492188, "debug/policy_rejected_logits": -1.6827794313430786, "debug/policy_rejected_logps": -214.09796142578125, "debug/reference_chosen_logps": -227.4351806640625, "debug/reference_rejected_logps": -213.31690979003906, "epoch": 0.7708333333333334, "grad_norm": 4.654373119163593, "learning_rate": 1e-06, "logits/chosen": -1.7069189548492432, "logits/rejected": -1.6827794313430786, "logps/chosen": -228.86813354492188, "logps/rejected": -214.09796142578125, "loss": 0.4902, "rewards/accuracies": 0.25, "rewards/chosen": -0.014329585246741772, "rewards/margins": -0.00651891715824604, "rewards/rejected": -0.007810668554157019, "step": 37 }, { "debug/policy_chosen_logits": -1.840372085571289, "debug/policy_chosen_logps": -230.10861206054688, "debug/policy_rejected_logits": -1.7545675039291382, "debug/policy_rejected_logps": -260.3162841796875, "debug/reference_chosen_logps": -230.72247314453125, "debug/reference_rejected_logps": -257.7795715332031, "epoch": 0.7916666666666666, "grad_norm": 4.825975520263496, "learning_rate": 1e-06, "logits/chosen": -1.840372085571289, "logits/rejected": -1.7545675039291382, "logps/chosen": -230.10861206054688, "logps/rejected": -260.3162841796875, "loss": 0.4896, "rewards/accuracies": 0.875, "rewards/chosen": 0.0061386870220303535, "rewards/margins": 0.03150550648570061, "rewards/rejected": -0.02536682039499283, "step": 38 }, { "debug/policy_chosen_logits": -1.920078992843628, "debug/policy_chosen_logps": -223.3726348876953, "debug/policy_rejected_logits": -1.828634262084961, "debug/policy_rejected_logps": -215.68679809570312, "debug/reference_chosen_logps": -225.49134826660156, "debug/reference_rejected_logps": -216.0248260498047, "epoch": 0.8125, "grad_norm": 4.550119735116361, "learning_rate": 1e-06, "logits/chosen": -1.920078992843628, "logits/rejected": -1.828634262084961, "logps/chosen": -223.3726348876953, "logps/rejected": -215.68679809570312, "loss": 0.4972, "rewards/accuracies": 0.625, "rewards/chosen": 0.0211871899664402, "rewards/margins": 0.01780683360993862, "rewards/rejected": 0.0033803561236709356, "step": 39 }, { "debug/policy_chosen_logits": -1.9407376050949097, "debug/policy_chosen_logps": -215.61834716796875, "debug/policy_rejected_logits": -1.7760881185531616, "debug/policy_rejected_logps": -239.66421508789062, "debug/reference_chosen_logps": -216.16978454589844, "debug/reference_rejected_logps": -234.6765594482422, "epoch": 0.8333333333333334, "grad_norm": 4.585659415126642, "learning_rate": 1e-06, "logits/chosen": -1.9407376050949097, "logits/rejected": -1.7760881185531616, "logps/chosen": -215.61834716796875, "logps/rejected": -239.66421508789062, "loss": 0.4894, "rewards/accuracies": 0.75, "rewards/chosen": 0.0055142780765891075, "rewards/margins": 0.055390775203704834, "rewards/rejected": -0.04987649619579315, "step": 40 }, { "debug/policy_chosen_logits": -1.8515745401382446, "debug/policy_chosen_logps": -230.47845458984375, "debug/policy_rejected_logits": -1.8017698526382446, "debug/policy_rejected_logps": -220.0799560546875, "debug/reference_chosen_logps": -231.57496643066406, "debug/reference_rejected_logps": -220.48355102539062, "epoch": 0.8541666666666666, "grad_norm": 4.376154994824471, "learning_rate": 1e-06, "logits/chosen": -1.8515745401382446, "logits/rejected": -1.8017698526382446, "logps/chosen": -230.47845458984375, "logps/rejected": -220.0799560546875, "loss": 0.4876, "rewards/accuracies": 0.625, "rewards/chosen": 0.010965080000460148, "rewards/margins": 0.006929206661880016, "rewards/rejected": 0.0040358733385801315, "step": 41 }, { "debug/policy_chosen_logits": -2.073969602584839, "debug/policy_chosen_logps": -197.85983276367188, "debug/policy_rejected_logits": -1.9918216466903687, "debug/policy_rejected_logps": -218.21873474121094, "debug/reference_chosen_logps": -199.05039978027344, "debug/reference_rejected_logps": -215.7158660888672, "epoch": 0.875, "grad_norm": 4.549372793147523, "learning_rate": 1e-06, "logits/chosen": -2.073969602584839, "logits/rejected": -1.9918216466903687, "logps/chosen": -197.85983276367188, "logps/rejected": -218.21873474121094, "loss": 0.4785, "rewards/accuracies": 1.0, "rewards/chosen": 0.011905612424015999, "rewards/margins": 0.036934297531843185, "rewards/rejected": -0.025028685107827187, "step": 42 }, { "debug/policy_chosen_logits": -1.8891502618789673, "debug/policy_chosen_logps": -212.4984588623047, "debug/policy_rejected_logits": -1.7939965724945068, "debug/policy_rejected_logps": -205.75616455078125, "debug/reference_chosen_logps": -214.7135772705078, "debug/reference_rejected_logps": -203.42559814453125, "epoch": 0.8958333333333334, "grad_norm": 4.709082311105444, "learning_rate": 1e-06, "logits/chosen": -1.8891502618789673, "logits/rejected": -1.7939965724945068, "logps/chosen": -212.4984588623047, "logps/rejected": -205.75616455078125, "loss": 0.4908, "rewards/accuracies": 0.875, "rewards/chosen": 0.022151164710521698, "rewards/margins": 0.04545694589614868, "rewards/rejected": -0.023305777460336685, "step": 43 }, { "debug/policy_chosen_logits": -1.713548183441162, "debug/policy_chosen_logps": -215.9876708984375, "debug/policy_rejected_logits": -1.6541578769683838, "debug/policy_rejected_logps": -211.833984375, "debug/reference_chosen_logps": -216.26913452148438, "debug/reference_rejected_logps": -211.96865844726562, "epoch": 0.9166666666666666, "grad_norm": 4.592980550179187, "learning_rate": 1e-06, "logits/chosen": -1.713548183441162, "logits/rejected": -1.6541578769683838, "logps/chosen": -215.9876708984375, "logps/rejected": -211.833984375, "loss": 0.4974, "rewards/accuracies": 0.5, "rewards/chosen": 0.0028144647367298603, "rewards/margins": 0.00146764749661088, "rewards/rejected": 0.0013468170072883368, "step": 44 }, { "debug/policy_chosen_logits": -1.9016904830932617, "debug/policy_chosen_logps": -224.2040557861328, "debug/policy_rejected_logits": -2.0025641918182373, "debug/policy_rejected_logps": -205.37611389160156, "debug/reference_chosen_logps": -224.86119079589844, "debug/reference_rejected_logps": -206.6584014892578, "epoch": 0.9375, "grad_norm": 4.466979083240116, "learning_rate": 1e-06, "logits/chosen": -1.9016904830932617, "logits/rejected": -2.0025641918182373, "logps/chosen": -224.2040557861328, "logps/rejected": -205.37611389160156, "loss": 0.4873, "rewards/accuracies": 0.5, "rewards/chosen": 0.006571331061422825, "rewards/margins": -0.006251506507396698, "rewards/rejected": 0.012822837568819523, "step": 45 }, { "debug/policy_chosen_logits": -1.9147862195968628, "debug/policy_chosen_logps": -214.40841674804688, "debug/policy_rejected_logits": -1.7642629146575928, "debug/policy_rejected_logps": -226.85809326171875, "debug/reference_chosen_logps": -216.05572509765625, "debug/reference_rejected_logps": -224.88555908203125, "epoch": 0.9583333333333334, "grad_norm": 4.657365424136277, "learning_rate": 1e-06, "logits/chosen": -1.9147862195968628, "logits/rejected": -1.7642629146575928, "logps/chosen": -214.40841674804688, "logps/rejected": -226.85809326171875, "loss": 0.4846, "rewards/accuracies": 0.75, "rewards/chosen": 0.016473084688186646, "rewards/margins": 0.03619840741157532, "rewards/rejected": -0.019725322723388672, "step": 46 }, { "debug/policy_chosen_logits": -1.9227019548416138, "debug/policy_chosen_logps": -232.88064575195312, "debug/policy_rejected_logits": -1.82795250415802, "debug/policy_rejected_logps": -216.19277954101562, "debug/reference_chosen_logps": -235.4039764404297, "debug/reference_rejected_logps": -218.12701416015625, "epoch": 0.9791666666666666, "grad_norm": 4.981963754300823, "learning_rate": 1e-06, "logits/chosen": -1.9227019548416138, "logits/rejected": -1.82795250415802, "logps/chosen": -232.88064575195312, "logps/rejected": -216.19277954101562, "loss": 0.4908, "rewards/accuracies": 0.625, "rewards/chosen": 0.025233382359147072, "rewards/margins": 0.0058908844366669655, "rewards/rejected": 0.019342496991157532, "step": 47 }, { "debug/policy_chosen_logits": -1.723406434059143, "debug/policy_chosen_logps": -218.03060913085938, "debug/policy_rejected_logits": -1.8165022134780884, "debug/policy_rejected_logps": -218.10299682617188, "debug/reference_chosen_logps": -217.13656616210938, "debug/reference_rejected_logps": -216.82008361816406, "epoch": 1.0, "grad_norm": 4.726122116654148, "learning_rate": 1e-06, "logits/chosen": -1.723406434059143, "logits/rejected": -1.8165022134780884, "logps/chosen": -218.03060913085938, "logps/rejected": -218.10299682617188, "loss": 0.4855, "rewards/accuracies": 0.5, "rewards/chosen": -0.008940430358052254, "rewards/margins": 0.003888798877596855, "rewards/rejected": -0.01282922737300396, "step": 48 }, { "epoch": 1.0, "step": 48, "total_flos": 0.0, "train_loss": 0.4953512450059255, "train_runtime": 163.6148, "train_samples_per_second": 18.66, "train_steps_per_second": 0.293 } ], "logging_steps": 1, "max_steps": 48, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }