{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2428, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.35546875, "learning_rate": 8.23045267489712e-09, "logits/chosen": 0.24564924836158752, "logits/rejected": 1.0062695741653442, "logps/chosen": -229.83255004882812, "logps/rejected": -164.65399169921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.369140625, "learning_rate": 8.230452674897118e-08, "logits/chosen": -0.04918687045574188, "logits/rejected": 0.6123232245445251, "logps/chosen": -238.79006958007812, "logps/rejected": -207.5037841796875, "loss": 0.6931, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.00015826645540073514, "rewards/margins": 0.0006196785252541304, "rewards/margins_max": 0.002893384313210845, "rewards/margins_min": -0.0016540272627025843, "rewards/margins_std": 0.0032155057415366173, "rewards/rejected": -0.0004614120698533952, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.443359375, "learning_rate": 1.6460905349794237e-07, "logits/chosen": 0.04978996887803078, "logits/rejected": 0.601681649684906, "logps/chosen": -255.1076202392578, "logps/rejected": -220.27145385742188, "loss": 0.6931, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 2.9706948225793894e-06, "rewards/margins": 0.00022058103058952838, "rewards/margins_max": 0.00360403535887599, "rewards/margins_min": -0.0031628732103854418, "rewards/margins_std": 0.004784926772117615, "rewards/rejected": -0.0002176103589590639, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.361328125, "learning_rate": 2.4691358024691354e-07, "logits/chosen": 0.0722523182630539, "logits/rejected": 0.5806540250778198, "logps/chosen": -242.0666046142578, "logps/rejected": -229.0381317138672, "loss": 0.693, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.00025353097589686513, "rewards/margins": -0.0001711220684228465, "rewards/margins_max": 0.00201609218493104, "rewards/margins_min": -0.002358336467295885, "rewards/margins_std": 0.003093188162893057, "rewards/rejected": -8.240890747401863e-05, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.408203125, "learning_rate": 3.2921810699588474e-07, "logits/chosen": 0.0854184553027153, "logits/rejected": 0.6598686575889587, "logps/chosen": -272.9035339355469, "logps/rejected": -232.7262725830078, "loss": 0.6926, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 8.360335777979344e-05, "rewards/margins": 0.001020856318064034, "rewards/margins_max": 0.003615677822381258, "rewards/margins_min": -0.0015739649534225464, "rewards/margins_std": 0.003669631900265813, "rewards/rejected": -0.0009372529457323253, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.40234375, "learning_rate": 4.11522633744856e-07, "logits/chosen": 0.03861381113529205, "logits/rejected": 0.42459020018577576, "logps/chosen": -248.6800537109375, "logps/rejected": -249.634033203125, "loss": 0.6923, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0004395098949316889, "rewards/margins": 0.00210589449852705, "rewards/margins_max": 0.004740457516163588, "rewards/margins_min": -0.0005286684026941657, "rewards/margins_std": 0.003725834656506777, "rewards/rejected": -0.0016663845162838697, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.412109375, "learning_rate": 4.938271604938271e-07, "logits/chosen": 0.028602436184883118, "logits/rejected": 0.599826991558075, "logps/chosen": -243.0851287841797, "logps/rejected": -205.10818481445312, "loss": 0.6915, "rewards/accuracies": 0.75, "rewards/chosen": 0.0011325245723128319, "rewards/margins": 0.002941467333585024, "rewards/margins_max": 0.005611724685877562, "rewards/margins_min": 0.0002712096902541816, "rewards/margins_std": 0.003776314901188016, "rewards/rejected": -0.0018089428776875138, "step": 60 }, { "epoch": 0.03, "grad_norm": 0.43359375, "learning_rate": 5.761316872427983e-07, "logits/chosen": 0.12637177109718323, "logits/rejected": 0.6491262912750244, "logps/chosen": -233.43142700195312, "logps/rejected": -179.89846801757812, "loss": 0.6918, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0002556543913669884, "rewards/margins": 0.0022196024656295776, "rewards/margins_max": 0.004624036606401205, "rewards/margins_min": -0.00018483158783055842, "rewards/margins_std": 0.0034003830514848232, "rewards/rejected": -0.001963948365300894, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.44140625, "learning_rate": 6.584362139917695e-07, "logits/chosen": -0.029578953981399536, "logits/rejected": 0.4067414402961731, "logps/chosen": -235.6042022705078, "logps/rejected": -224.54019165039062, "loss": 0.6906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0017460808157920837, "rewards/margins": 0.004769898019731045, "rewards/margins_max": 0.007818765938282013, "rewards/margins_min": 0.0017210301011800766, "rewards/margins_std": 0.004311750642955303, "rewards/rejected": -0.0030238174367696047, "step": 80 }, { "epoch": 0.04, "grad_norm": 0.4921875, "learning_rate": 7.407407407407406e-07, "logits/chosen": 0.2614774703979492, "logits/rejected": 0.6254442930221558, "logps/chosen": -205.9300994873047, "logps/rejected": -194.76925659179688, "loss": 0.6901, "rewards/accuracies": 0.875, "rewards/chosen": 0.002798306755721569, "rewards/margins": 0.006482880562543869, "rewards/margins_max": 0.01096098218113184, "rewards/margins_min": 0.002004781039431691, "rewards/margins_std": 0.0063329897820949554, "rewards/rejected": -0.0036845742724835873, "step": 90 }, { "epoch": 0.04, "grad_norm": 0.63671875, "learning_rate": 8.23045267489712e-07, "logits/chosen": -0.02924344502389431, "logits/rejected": 0.43279844522476196, "logps/chosen": -237.76242065429688, "logps/rejected": -233.0888671875, "loss": 0.6891, "rewards/accuracies": 0.8125, "rewards/chosen": 0.002704631770029664, "rewards/margins": 0.008112462237477303, "rewards/margins_max": 0.013658873736858368, "rewards/margins_min": 0.0025660484097898006, "rewards/margins_std": 0.007843811996281147, "rewards/rejected": -0.0054078297689557076, "step": 100 }, { "epoch": 0.05, "grad_norm": 0.5, "learning_rate": 9.053497942386831e-07, "logits/chosen": 0.05362590029835701, "logits/rejected": 0.6371204257011414, "logps/chosen": -253.3213348388672, "logps/rejected": -201.6106414794922, "loss": 0.6881, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0040958598256111145, "rewards/margins": 0.010091823525726795, "rewards/margins_max": 0.015223483555018902, "rewards/margins_min": 0.004960163962095976, "rewards/margins_std": 0.007257262710481882, "rewards/rejected": -0.005995963700115681, "step": 110 }, { "epoch": 0.05, "grad_norm": 0.375, "learning_rate": 9.876543209876542e-07, "logits/chosen": 0.02120272070169449, "logits/rejected": 0.5330603718757629, "logps/chosen": -230.68896484375, "logps/rejected": -202.53201293945312, "loss": 0.6871, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.004152396693825722, "rewards/margins": 0.01102996151894331, "rewards/margins_max": 0.016634680330753326, "rewards/margins_min": 0.005425242241472006, "rewards/margins_std": 0.007926270365715027, "rewards/rejected": -0.006877565290778875, "step": 120 }, { "epoch": 0.05, "grad_norm": 0.470703125, "learning_rate": 1.0699588477366254e-06, "logits/chosen": 0.13731414079666138, "logits/rejected": 0.6454218626022339, "logps/chosen": -265.0476379394531, "logps/rejected": -232.1322479248047, "loss": 0.6851, "rewards/accuracies": 0.9375, "rewards/chosen": 0.008298086933791637, "rewards/margins": 0.01611563190817833, "rewards/margins_max": 0.0227045975625515, "rewards/margins_min": 0.009526659734547138, "rewards/margins_std": 0.009318210184574127, "rewards/rejected": -0.007817542180418968, "step": 130 }, { "epoch": 0.06, "grad_norm": 0.46875, "learning_rate": 1.1522633744855967e-06, "logits/chosen": 0.0916055217385292, "logits/rejected": 0.5900839567184448, "logps/chosen": -250.656005859375, "logps/rejected": -215.56851196289062, "loss": 0.6834, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.008936955593526363, "rewards/margins": 0.018578212708234787, "rewards/margins_max": 0.027034681290388107, "rewards/margins_min": 0.010121742263436317, "rewards/margins_std": 0.011959253810346127, "rewards/rejected": -0.009641257114708424, "step": 140 }, { "epoch": 0.06, "grad_norm": 0.50390625, "learning_rate": 1.2345679012345677e-06, "logits/chosen": 0.02975723147392273, "logits/rejected": 0.7174798250198364, "logps/chosen": -273.29351806640625, "logps/rejected": -229.1119842529297, "loss": 0.6808, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.016104739159345627, "rewards/margins": 0.025698691606521606, "rewards/margins_max": 0.037033237516880035, "rewards/margins_min": 0.014364147558808327, "rewards/margins_std": 0.016029467806220055, "rewards/rejected": -0.00959395244717598, "step": 150 }, { "epoch": 0.07, "grad_norm": 0.396484375, "learning_rate": 1.316872427983539e-06, "logits/chosen": 0.014136433601379395, "logits/rejected": 0.4981762766838074, "logps/chosen": -229.10940551757812, "logps/rejected": -197.9926300048828, "loss": 0.6811, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.014829346910119057, "rewards/margins": 0.026064058765769005, "rewards/margins_max": 0.03911607339978218, "rewards/margins_min": 0.01301204226911068, "rewards/margins_std": 0.01845833659172058, "rewards/rejected": -0.011234709993004799, "step": 160 }, { "epoch": 0.07, "grad_norm": 0.3984375, "learning_rate": 1.3991769547325102e-06, "logits/chosen": 0.057651955634355545, "logits/rejected": 0.5741917490959167, "logps/chosen": -231.759033203125, "logps/rejected": -235.8512420654297, "loss": 0.6767, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02400249056518078, "rewards/margins": 0.03238454461097717, "rewards/margins_max": 0.04800540953874588, "rewards/margins_min": 0.01676369085907936, "rewards/margins_std": 0.022091226652264595, "rewards/rejected": -0.008382054045796394, "step": 170 }, { "epoch": 0.07, "grad_norm": 0.375, "learning_rate": 1.4814814814814812e-06, "logits/chosen": 0.21878819167613983, "logits/rejected": 0.5812119245529175, "logps/chosen": -207.7227783203125, "logps/rejected": -216.443115234375, "loss": 0.6781, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.014631425030529499, "rewards/margins": 0.029586512595415115, "rewards/margins_max": 0.042366527020931244, "rewards/margins_min": 0.016806500032544136, "rewards/margins_std": 0.018073670566082, "rewards/rejected": -0.014955088496208191, "step": 180 }, { "epoch": 0.08, "grad_norm": 0.50390625, "learning_rate": 1.5637860082304525e-06, "logits/chosen": 0.1410341113805771, "logits/rejected": 0.7296265363693237, "logps/chosen": -253.40957641601562, "logps/rejected": -219.5933837890625, "loss": 0.6718, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02252976968884468, "rewards/margins": 0.03989989683032036, "rewards/margins_max": 0.05754275247454643, "rewards/margins_min": 0.022257043048739433, "rewards/margins_std": 0.02495076134800911, "rewards/rejected": -0.017370129004120827, "step": 190 }, { "epoch": 0.08, "grad_norm": 0.474609375, "learning_rate": 1.646090534979424e-06, "logits/chosen": 0.050902754068374634, "logits/rejected": 0.6948504447937012, "logps/chosen": -269.0355529785156, "logps/rejected": -228.97781372070312, "loss": 0.6694, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.031442780047655106, "rewards/margins": 0.049986980855464935, "rewards/margins_max": 0.07442543655633926, "rewards/margins_min": 0.025548523291945457, "rewards/margins_std": 0.03456118702888489, "rewards/rejected": -0.018544193357229233, "step": 200 }, { "epoch": 0.09, "grad_norm": 0.46875, "learning_rate": 1.7283950617283948e-06, "logits/chosen": 0.055720794945955276, "logits/rejected": 0.5427404642105103, "logps/chosen": -244.46435546875, "logps/rejected": -219.5703887939453, "loss": 0.6705, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.029670244082808495, "rewards/margins": 0.050485529005527496, "rewards/margins_max": 0.07362981140613556, "rewards/margins_min": 0.027341246604919434, "rewards/margins_std": 0.03273095563054085, "rewards/rejected": -0.020815281197428703, "step": 210 }, { "epoch": 0.09, "grad_norm": 0.416015625, "learning_rate": 1.8106995884773662e-06, "logits/chosen": 0.09668431431055069, "logits/rejected": 0.6245120167732239, "logps/chosen": -235.32608032226562, "logps/rejected": -208.3621063232422, "loss": 0.665, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.030144259333610535, "rewards/margins": 0.059492819011211395, "rewards/margins_max": 0.0863434299826622, "rewards/margins_min": 0.03264220803976059, "rewards/margins_std": 0.037972498685121536, "rewards/rejected": -0.02934856340289116, "step": 220 }, { "epoch": 0.09, "grad_norm": 0.431640625, "learning_rate": 1.8930041152263375e-06, "logits/chosen": -0.016712257638573647, "logits/rejected": 0.5332568883895874, "logps/chosen": -288.5390319824219, "logps/rejected": -247.04513549804688, "loss": 0.6598, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03193775564432144, "rewards/margins": 0.07212281227111816, "rewards/margins_max": 0.10512430965900421, "rewards/margins_min": 0.039121340960264206, "rewards/margins_std": 0.04667114093899727, "rewards/rejected": -0.04018506780266762, "step": 230 }, { "epoch": 0.1, "grad_norm": 0.45703125, "learning_rate": 1.9753086419753083e-06, "logits/chosen": 0.020423922687768936, "logits/rejected": 0.5501176118850708, "logps/chosen": -246.56204223632812, "logps/rejected": -229.26943969726562, "loss": 0.6555, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.03230474889278412, "rewards/margins": 0.07798168063163757, "rewards/margins_max": 0.11025931686162949, "rewards/margins_min": 0.045704036951065063, "rewards/margins_std": 0.045647479593753815, "rewards/rejected": -0.045676928013563156, "step": 240 }, { "epoch": 0.1, "grad_norm": 0.458984375, "learning_rate": 1.999949352352126e-06, "logits/chosen": -0.020852217450737953, "logits/rejected": 0.5500503778457642, "logps/chosen": -271.888916015625, "logps/rejected": -256.8134765625, "loss": 0.6557, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.04405756667256355, "rewards/margins": 0.07757656276226044, "rewards/margins_max": 0.10984426736831665, "rewards/margins_min": 0.04530886188149452, "rewards/margins_std": 0.04563341662287712, "rewards/rejected": -0.033518996089696884, "step": 250 }, { "epoch": 0.11, "grad_norm": 0.443359375, "learning_rate": 1.999701294590502e-06, "logits/chosen": 0.1329582929611206, "logits/rejected": 0.7303738594055176, "logps/chosen": -267.12274169921875, "logps/rejected": -211.52099609375, "loss": 0.6531, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.022596510127186775, "rewards/margins": 0.0730314701795578, "rewards/margins_max": 0.11627304553985596, "rewards/margins_min": 0.029789889231324196, "rewards/margins_std": 0.061152826994657516, "rewards/rejected": -0.050434958189725876, "step": 260 }, { "epoch": 0.11, "grad_norm": 0.439453125, "learning_rate": 1.9992465753011367e-06, "logits/chosen": 0.02824712172150612, "logits/rejected": 0.6052254438400269, "logps/chosen": -287.28057861328125, "logps/rejected": -242.43618774414062, "loss": 0.643, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04510802775621414, "rewards/margins": 0.10490649938583374, "rewards/margins_max": 0.149946391582489, "rewards/margins_min": 0.059866636991500854, "rewards/margins_std": 0.0636960119009018, "rewards/rejected": -0.059798479080200195, "step": 270 }, { "epoch": 0.12, "grad_norm": 0.478515625, "learning_rate": 1.9985852884850918e-06, "logits/chosen": 0.16367292404174805, "logits/rejected": 0.6942164301872253, "logps/chosen": -255.4570770263672, "logps/rejected": -232.69760131835938, "loss": 0.6471, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.027844402939081192, "rewards/margins": 0.09164806455373764, "rewards/margins_max": 0.12632620334625244, "rewards/margins_min": 0.05696992203593254, "rewards/margins_std": 0.04904230311512947, "rewards/rejected": -0.06380365788936615, "step": 280 }, { "epoch": 0.12, "grad_norm": 0.419921875, "learning_rate": 1.9977175708457446e-06, "logits/chosen": 0.1536540985107422, "logits/rejected": 0.6605737209320068, "logps/chosen": -240.44729614257812, "logps/rejected": -224.4777374267578, "loss": 0.644, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02331436611711979, "rewards/margins": 0.11056084930896759, "rewards/margins_max": 0.1643027514219284, "rewards/margins_min": 0.05681893974542618, "rewards/margins_std": 0.07600252330303192, "rewards/rejected": -0.08724648505449295, "step": 290 }, { "epoch": 0.12, "grad_norm": 0.46484375, "learning_rate": 1.9966436017605294e-06, "logits/chosen": 0.07062125205993652, "logits/rejected": 0.6594554781913757, "logps/chosen": -252.1466064453125, "logps/rejected": -233.97787475585938, "loss": 0.6346, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.028239211067557335, "rewards/margins": 0.13431617617607117, "rewards/margins_max": 0.19096195697784424, "rewards/margins_min": 0.0776703953742981, "rewards/margins_std": 0.08010922372341156, "rewards/rejected": -0.10607695579528809, "step": 300 }, { "epoch": 0.13, "grad_norm": 0.408203125, "learning_rate": 1.995363603243855e-06, "logits/chosen": 0.2892570495605469, "logits/rejected": 0.6574875712394714, "logps/chosen": -216.6258087158203, "logps/rejected": -206.6038818359375, "loss": 0.6333, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.019283967092633247, "rewards/margins": 0.121584951877594, "rewards/margins_max": 0.1772836297750473, "rewards/margins_min": 0.06588628143072128, "rewards/margins_std": 0.07876982539892197, "rewards/rejected": -0.1023009866476059, "step": 310 }, { "epoch": 0.13, "grad_norm": 0.45703125, "learning_rate": 1.9938778399012094e-06, "logits/chosen": 0.1439567506313324, "logits/rejected": 0.6365154385566711, "logps/chosen": -236.16378784179688, "logps/rejected": -216.11618041992188, "loss": 0.6301, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.011489281430840492, "rewards/margins": 0.13021530210971832, "rewards/margins_max": 0.18579894304275513, "rewards/margins_min": 0.07463165372610092, "rewards/margins_std": 0.07860714942216873, "rewards/rejected": -0.11872602999210358, "step": 320 }, { "epoch": 0.14, "grad_norm": 0.455078125, "learning_rate": 1.9921866188744596e-06, "logits/chosen": 0.03234120458364487, "logits/rejected": 0.6771044731140137, "logps/chosen": -229.37313842773438, "logps/rejected": -193.59500122070312, "loss": 0.6243, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.01423537265509367, "rewards/margins": 0.14501173794269562, "rewards/margins_max": 0.19469766318798065, "rewards/margins_min": 0.09532581269741058, "rewards/margins_std": 0.07026650756597519, "rewards/rejected": -0.13077637553215027, "step": 330 }, { "epoch": 0.14, "grad_norm": 0.466796875, "learning_rate": 1.990290289778359e-06, "logits/chosen": 0.2403167188167572, "logits/rejected": 0.7011794447898865, "logps/chosen": -252.5185089111328, "logps/rejected": -227.126708984375, "loss": 0.6276, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0113031892105937, "rewards/margins": 0.128191739320755, "rewards/margins_max": 0.19695988297462463, "rewards/margins_min": 0.05942361429333687, "rewards/margins_std": 0.09725283086299896, "rewards/rejected": -0.13949494063854218, "step": 340 }, { "epoch": 0.14, "grad_norm": 0.4765625, "learning_rate": 1.988189244628272e-06, "logits/chosen": 0.10928988456726074, "logits/rejected": 0.6726782917976379, "logps/chosen": -255.8716278076172, "logps/rejected": -242.6270751953125, "loss": 0.6212, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0170623529702425, "rewards/margins": 0.17394407093524933, "rewards/margins_max": 0.25714507699012756, "rewards/margins_min": 0.0907430499792099, "rewards/margins_std": 0.11766400188207626, "rewards/rejected": -0.15688170492649078, "step": 350 }, { "epoch": 0.15, "grad_norm": 0.5625, "learning_rate": 1.9858839177591384e-06, "logits/chosen": 0.1698768585920334, "logits/rejected": 0.7562354803085327, "logps/chosen": -246.46035766601562, "logps/rejected": -253.3776092529297, "loss": 0.6066, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.019840896129608154, "rewards/margins": 0.19313883781433105, "rewards/margins_max": 0.2668205201625824, "rewards/margins_min": 0.11945716291666031, "rewards/margins_std": 0.10420163720846176, "rewards/rejected": -0.1732979416847229, "step": 360 }, { "epoch": 0.15, "grad_norm": 0.427734375, "learning_rate": 1.9833747857356827e-06, "logits/chosen": 0.0925469920039177, "logits/rejected": 0.6798457503318787, "logps/chosen": -227.651123046875, "logps/rejected": -220.4510498046875, "loss": 0.6026, "rewards/accuracies": 1.0, "rewards/chosen": 0.004902270622551441, "rewards/margins": 0.19105985760688782, "rewards/margins_max": 0.27683204412460327, "rewards/margins_min": 0.10528764873743057, "rewards/margins_std": 0.12130022048950195, "rewards/rejected": -0.18615756928920746, "step": 370 }, { "epoch": 0.16, "grad_norm": 0.43359375, "learning_rate": 1.9806623672538997e-06, "logits/chosen": 0.0311798807233572, "logits/rejected": 0.5755618810653687, "logps/chosen": -231.2305450439453, "logps/rejected": -229.2065887451172, "loss": 0.606, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.006447208113968372, "rewards/margins": 0.17201881110668182, "rewards/margins_max": 0.2625262141227722, "rewards/margins_min": 0.08151140064001083, "rewards/margins_std": 0.12799681723117828, "rewards/rejected": -0.16557160019874573, "step": 380 }, { "epoch": 0.16, "grad_norm": 0.50390625, "learning_rate": 1.9777472230338267e-06, "logits/chosen": 0.015010332688689232, "logits/rejected": 0.6248622536659241, "logps/chosen": -252.9764404296875, "logps/rejected": -239.882568359375, "loss": 0.587, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.004931238479912281, "rewards/margins": 0.2389887571334839, "rewards/margins_max": 0.34165245294570923, "rewards/margins_min": 0.13632504642009735, "rewards/margins_std": 0.14518840610980988, "rewards/rejected": -0.23405751585960388, "step": 390 }, { "epoch": 0.16, "grad_norm": 0.3984375, "learning_rate": 1.9746299557036303e-06, "logits/chosen": 0.10073033720254898, "logits/rejected": 0.8144109845161438, "logps/chosen": -293.6244201660156, "logps/rejected": -236.57235717773438, "loss": 0.5871, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.005161326378583908, "rewards/margins": 0.2277032881975174, "rewards/margins_max": 0.33263522386550903, "rewards/margins_min": 0.12277133762836456, "rewards/margins_std": 0.14839616417884827, "rewards/rejected": -0.2328646183013916, "step": 400 }, { "epoch": 0.17, "grad_norm": 0.51953125, "learning_rate": 1.9713112096750285e-06, "logits/chosen": -0.02684302069246769, "logits/rejected": 0.59294593334198, "logps/chosen": -248.1615753173828, "logps/rejected": -245.7884979248047, "loss": 0.5744, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.011066530831158161, "rewards/margins": 0.2501886487007141, "rewards/margins_max": 0.3555333614349365, "rewards/margins_min": 0.1448439061641693, "rewards/margins_std": 0.14897994697093964, "rewards/rejected": -0.23912210762500763, "step": 410 }, { "epoch": 0.17, "grad_norm": 0.5078125, "learning_rate": 1.967791671010076e-06, "logits/chosen": 0.14378827810287476, "logits/rejected": 0.6853595972061157, "logps/chosen": -264.4352722167969, "logps/rejected": -287.1185607910156, "loss": 0.5597, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.02877393364906311, "rewards/margins": 0.3156929314136505, "rewards/margins_max": 0.4402744770050049, "rewards/margins_min": 0.19111141562461853, "rewards/margins_std": 0.17618489265441895, "rewards/rejected": -0.344466894865036, "step": 420 }, { "epoch": 0.18, "grad_norm": 0.5234375, "learning_rate": 1.96407206727934e-06, "logits/chosen": 0.010093556717038155, "logits/rejected": 0.5722212195396423, "logps/chosen": -261.48260498046875, "logps/rejected": -242.7886505126953, "loss": 0.5802, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.03540874272584915, "rewards/margins": 0.2602460980415344, "rewards/margins_max": 0.41468754410743713, "rewards/margins_min": 0.1058046966791153, "rewards/margins_std": 0.21841315925121307, "rewards/rejected": -0.29565486311912537, "step": 430 }, { "epoch": 0.18, "grad_norm": 0.458984375, "learning_rate": 1.9601531674114928e-06, "logits/chosen": 0.1727772355079651, "logits/rejected": 0.7151114344596863, "logps/chosen": -261.04229736328125, "logps/rejected": -249.06704711914062, "loss": 0.5599, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.037648189812898636, "rewards/margins": 0.3025735020637512, "rewards/margins_max": 0.4183417856693268, "rewards/margins_min": 0.18680524826049805, "rewards/margins_std": 0.1637210100889206, "rewards/rejected": -0.34022170305252075, "step": 440 }, { "epoch": 0.19, "grad_norm": 0.6015625, "learning_rate": 1.9560357815343576e-06, "logits/chosen": -0.02275443822145462, "logits/rejected": 0.6079251766204834, "logps/chosen": -291.15667724609375, "logps/rejected": -295.98114013671875, "loss": 0.5534, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.03242644667625427, "rewards/margins": 0.3713623881340027, "rewards/margins_max": 0.5583276748657227, "rewards/margins_min": 0.18439707159996033, "rewards/margins_std": 0.2644089162349701, "rewards/rejected": -0.40378880500793457, "step": 450 }, { "epoch": 0.19, "grad_norm": 0.5390625, "learning_rate": 1.9517207608074365e-06, "logits/chosen": 0.010817606933414936, "logits/rejected": 0.457902729511261, "logps/chosen": -247.76101684570312, "logps/rejected": -259.99969482421875, "loss": 0.5481, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.046189289540052414, "rewards/margins": 0.35407206416130066, "rewards/margins_max": 0.5428398251533508, "rewards/margins_min": 0.16530433297157288, "rewards/margins_std": 0.2669579088687897, "rewards/rejected": -0.40026140213012695, "step": 460 }, { "epoch": 0.19, "grad_norm": 0.4765625, "learning_rate": 1.9472089972459547e-06, "logits/chosen": -0.01565355248749256, "logits/rejected": 0.6114306449890137, "logps/chosen": -269.9651794433594, "logps/rejected": -249.69210815429688, "loss": 0.5448, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06878812611103058, "rewards/margins": 0.32420676946640015, "rewards/margins_max": 0.5209608674049377, "rewards/margins_min": 0.12745265662670135, "rewards/margins_std": 0.2782523036003113, "rewards/rejected": -0.39299488067626953, "step": 470 }, { "epoch": 0.2, "grad_norm": 0.5703125, "learning_rate": 1.942501423536461e-06, "logits/chosen": 0.1253264844417572, "logits/rejected": 0.615772545337677, "logps/chosen": -234.8943634033203, "logps/rejected": -258.9188232421875, "loss": 0.5388, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.08462107926607132, "rewards/margins": 0.3709794580936432, "rewards/margins_max": 0.5461065173149109, "rewards/margins_min": 0.19585230946540833, "rewards/margins_std": 0.24766714870929718, "rewards/rejected": -0.45560044050216675, "step": 480 }, { "epoch": 0.2, "grad_norm": 0.5625, "learning_rate": 1.93759901284402e-06, "logits/chosen": 0.05123847723007202, "logits/rejected": 0.5316873788833618, "logps/chosen": -255.9114532470703, "logps/rejected": -303.42718505859375, "loss": 0.5123, "rewards/accuracies": 1.0, "rewards/chosen": -0.06182605028152466, "rewards/margins": 0.5358118414878845, "rewards/margins_max": 0.822005569934845, "rewards/margins_min": 0.24961814284324646, "rewards/margins_std": 0.40473905205726624, "rewards/rejected": -0.597637951374054, "step": 490 }, { "epoch": 0.21, "grad_norm": 0.6328125, "learning_rate": 1.932502778611036e-06, "logits/chosen": -0.022655535489320755, "logits/rejected": 0.6211397647857666, "logps/chosen": -233.3824462890625, "logps/rejected": -231.17599487304688, "loss": 0.5255, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07823699712753296, "rewards/margins": 0.4163808822631836, "rewards/margins_max": 0.658298134803772, "rewards/margins_min": 0.1744636446237564, "rewards/margins_std": 0.3421226441860199, "rewards/rejected": -0.49461787939071655, "step": 500 }, { "epoch": 0.21, "grad_norm": 0.61328125, "learning_rate": 1.9272137743477504e-06, "logits/chosen": 0.20029589533805847, "logits/rejected": 0.7615786790847778, "logps/chosen": -251.86819458007812, "logps/rejected": -269.39971923828125, "loss": 0.4883, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.10473309457302094, "rewards/margins": 0.5166608095169067, "rewards/margins_max": 0.7467631101608276, "rewards/margins_min": 0.28655844926834106, "rewards/margins_std": 0.32541388273239136, "rewards/rejected": -0.6213939189910889, "step": 510 }, { "epoch": 0.21, "grad_norm": 0.57421875, "learning_rate": 1.9217330934144564e-06, "logits/chosen": 0.03549078106880188, "logits/rejected": 0.6382437348365784, "logps/chosen": -269.1963806152344, "logps/rejected": -284.4259338378906, "loss": 0.5078, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13496743142604828, "rewards/margins": 0.5025084614753723, "rewards/margins_max": 0.7574218511581421, "rewards/margins_min": 0.24759499728679657, "rewards/margins_std": 0.3605020344257355, "rewards/rejected": -0.637475848197937, "step": 520 }, { "epoch": 0.22, "grad_norm": 0.66796875, "learning_rate": 1.916061868795478e-06, "logits/chosen": 0.2350475788116455, "logits/rejected": 0.724064290523529, "logps/chosen": -263.99786376953125, "logps/rejected": -286.0734558105469, "loss": 0.5188, "rewards/accuracies": 0.9375, "rewards/chosen": -0.15573835372924805, "rewards/margins": 0.44272828102111816, "rewards/margins_max": 0.688580334186554, "rewards/margins_min": 0.19687625765800476, "rewards/margins_std": 0.3476872742176056, "rewards/rejected": -0.598466694355011, "step": 530 }, { "epoch": 0.22, "grad_norm": 0.6953125, "learning_rate": 1.910201272864954e-06, "logits/chosen": 0.1834249347448349, "logits/rejected": 0.7275029420852661, "logps/chosen": -267.31304931640625, "logps/rejected": -274.42120361328125, "loss": 0.4977, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.1938314139842987, "rewards/margins": 0.532240092754364, "rewards/margins_max": 0.8296705484390259, "rewards/margins_min": 0.23480959236621857, "rewards/margins_std": 0.42063021659851074, "rewards/rejected": -0.7260714769363403, "step": 540 }, { "epoch": 0.23, "grad_norm": 0.578125, "learning_rate": 1.9041525171444798e-06, "logits/chosen": -0.012256382033228874, "logits/rejected": 0.5923458933830261, "logps/chosen": -266.64044189453125, "logps/rejected": -261.02215576171875, "loss": 0.5211, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.19421935081481934, "rewards/margins": 0.3819182515144348, "rewards/margins_max": 0.5985667705535889, "rewards/margins_min": 0.16526973247528076, "rewards/margins_std": 0.3063872456550598, "rewards/rejected": -0.5761376023292542, "step": 550 }, { "epoch": 0.23, "grad_norm": 0.7265625, "learning_rate": 1.897916852052661e-06, "logits/chosen": -0.12678642570972443, "logits/rejected": 0.4983861446380615, "logps/chosen": -287.7276306152344, "logps/rejected": -330.13482666015625, "loss": 0.4613, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2061140239238739, "rewards/margins": 0.6443026065826416, "rewards/margins_max": 0.9833240509033203, "rewards/margins_min": 0.30528122186660767, "rewards/margins_std": 0.4794486463069916, "rewards/rejected": -0.8504166603088379, "step": 560 }, { "epoch": 0.23, "grad_norm": 0.56640625, "learning_rate": 1.8914955666466205e-06, "logits/chosen": 0.03088958188891411, "logits/rejected": 0.6151102781295776, "logps/chosen": -258.5025634765625, "logps/rejected": -306.0191650390625, "loss": 0.4808, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.18039652705192566, "rewards/margins": 0.6223662495613098, "rewards/margins_max": 0.9027007222175598, "rewards/margins_min": 0.3420317769050598, "rewards/margins_std": 0.39645272493362427, "rewards/rejected": -0.8027628064155579, "step": 570 }, { "epoch": 0.24, "grad_norm": 0.625, "learning_rate": 1.8848899883555203e-06, "logits/chosen": 0.09567205607891083, "logits/rejected": 0.7999943494796753, "logps/chosen": -286.7126770019531, "logps/rejected": -330.4248962402344, "loss": 0.4626, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.2216329574584961, "rewards/margins": 0.682709813117981, "rewards/margins_max": 1.0534820556640625, "rewards/margins_min": 0.3119375705718994, "rewards/margins_std": 0.5243510007858276, "rewards/rejected": -0.904342770576477, "step": 580 }, { "epoch": 0.24, "grad_norm": 0.60546875, "learning_rate": 1.8781014827061518e-06, "logits/chosen": 0.051412492990493774, "logits/rejected": 0.7583128213882446, "logps/chosen": -257.7097473144531, "logps/rejected": -265.9875183105469, "loss": 0.4877, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.2396390736103058, "rewards/margins": 0.5867950916290283, "rewards/margins_max": 0.9322711825370789, "rewards/margins_min": 0.24131877720355988, "rewards/margins_std": 0.4885772168636322, "rewards/rejected": -0.8264341354370117, "step": 590 }, { "epoch": 0.25, "grad_norm": 0.62890625, "learning_rate": 1.8711314530406498e-06, "logits/chosen": 0.02027386799454689, "logits/rejected": 0.6661813259124756, "logps/chosen": -280.32208251953125, "logps/rejected": -305.6834411621094, "loss": 0.4487, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.2480877935886383, "rewards/margins": 0.7212269306182861, "rewards/margins_max": 1.1429131031036377, "rewards/margins_min": 0.29954075813293457, "rewards/margins_std": 0.5963543653488159, "rewards/rejected": -0.969314694404602, "step": 600 }, { "epoch": 0.25, "grad_norm": 0.66796875, "learning_rate": 1.8639813402263877e-06, "logits/chosen": -0.020800206810235977, "logits/rejected": 0.6128827333450317, "logps/chosen": -306.2142333984375, "logps/rejected": -309.4848327636719, "loss": 0.4649, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3131290376186371, "rewards/margins": 0.7527261972427368, "rewards/margins_max": 1.202072024345398, "rewards/margins_min": 0.30338022112846375, "rewards/margins_std": 0.6354711055755615, "rewards/rejected": -1.0658552646636963, "step": 610 }, { "epoch": 0.26, "grad_norm": 0.78515625, "learning_rate": 1.8566526223581192e-06, "logits/chosen": 0.039588745683431625, "logits/rejected": 0.5985323786735535, "logps/chosen": -284.71929931640625, "logps/rejected": -321.4588928222656, "loss": 0.4402, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3328380584716797, "rewards/margins": 0.8725109100341797, "rewards/margins_max": 1.4007904529571533, "rewards/margins_min": 0.3442313075065613, "rewards/margins_std": 0.7471002340316772, "rewards/rejected": -1.2053489685058594, "step": 620 }, { "epoch": 0.26, "grad_norm": 0.58984375, "learning_rate": 1.8491468144524177e-06, "logits/chosen": -0.07715997099876404, "logits/rejected": 0.4883267283439636, "logps/chosen": -325.30877685546875, "logps/rejected": -369.89923095703125, "loss": 0.4566, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.43882113695144653, "rewards/margins": 0.86943519115448, "rewards/margins_max": 1.459219217300415, "rewards/margins_min": 0.2796511948108673, "rewards/margins_std": 0.8340805768966675, "rewards/rejected": -1.3082562685012817, "step": 630 }, { "epoch": 0.26, "grad_norm": 0.59765625, "learning_rate": 1.8414654681344916e-06, "logits/chosen": -0.09930239617824554, "logits/rejected": 0.520045280456543, "logps/chosen": -286.84912109375, "logps/rejected": -331.6852111816406, "loss": 0.4534, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.4153270125389099, "rewards/margins": 0.733604907989502, "rewards/margins_max": 1.1434333324432373, "rewards/margins_min": 0.32377633452415466, "rewards/margins_std": 0.579585075378418, "rewards/rejected": -1.1489319801330566, "step": 640 }, { "epoch": 0.27, "grad_norm": 0.640625, "learning_rate": 1.833610171317424e-06, "logits/chosen": 0.07414983212947845, "logits/rejected": 0.6430131196975708, "logps/chosen": -300.6047668457031, "logps/rejected": -354.2503967285156, "loss": 0.4345, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4335756301879883, "rewards/margins": 0.7363289594650269, "rewards/margins_max": 1.2120670080184937, "rewards/margins_min": 0.2605907917022705, "rewards/margins_std": 0.6727953553199768, "rewards/rejected": -1.1699045896530151, "step": 650 }, { "epoch": 0.27, "grad_norm": 0.5859375, "learning_rate": 1.8255825478739157e-06, "logits/chosen": 0.15367427468299866, "logits/rejected": 0.6044631004333496, "logps/chosen": -254.28286743164062, "logps/rejected": -345.909912109375, "loss": 0.4196, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.33262401819229126, "rewards/margins": 0.8059485554695129, "rewards/margins_max": 1.1957746744155884, "rewards/margins_min": 0.4161224365234375, "rewards/margins_std": 0.5512973666191101, "rewards/rejected": -1.1385724544525146, "step": 660 }, { "epoch": 0.28, "grad_norm": 0.68359375, "learning_rate": 1.8173842573005922e-06, "logits/chosen": -0.05639176443219185, "logits/rejected": 0.4384500980377197, "logps/chosen": -290.5333557128906, "logps/rejected": -347.9354553222656, "loss": 0.4435, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5654058456420898, "rewards/margins": 0.7975835204124451, "rewards/margins_max": 1.297826886177063, "rewards/margins_min": 0.2973402142524719, "rewards/margins_std": 0.707450807094574, "rewards/rejected": -1.3629894256591797, "step": 670 }, { "epoch": 0.28, "grad_norm": 0.72265625, "learning_rate": 1.8090169943749474e-06, "logits/chosen": 0.07231085002422333, "logits/rejected": 0.709048330783844, "logps/chosen": -317.9612731933594, "logps/rejected": -366.837890625, "loss": 0.4035, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4872046113014221, "rewards/margins": 0.969443678855896, "rewards/margins_max": 1.5566800832748413, "rewards/margins_min": 0.3822072446346283, "rewards/margins_std": 0.8304777145385742, "rewards/rejected": -1.456648349761963, "step": 680 }, { "epoch": 0.28, "grad_norm": 0.6953125, "learning_rate": 1.8004824888049936e-06, "logits/chosen": -0.02150675281882286, "logits/rejected": 0.6057881116867065, "logps/chosen": -290.603271484375, "logps/rejected": -371.76788330078125, "loss": 0.4219, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.45583152770996094, "rewards/margins": 1.104190707206726, "rewards/margins_max": 1.8512020111083984, "rewards/margins_min": 0.3571794033050537, "rewards/margins_std": 1.0564334392547607, "rewards/rejected": -1.560022234916687, "step": 690 }, { "epoch": 0.29, "grad_norm": 0.76171875, "learning_rate": 1.791782504871691e-06, "logits/chosen": -0.041911929845809937, "logits/rejected": 0.615075409412384, "logps/chosen": -325.01495361328125, "logps/rejected": -324.512451171875, "loss": 0.4271, "rewards/accuracies": 0.875, "rewards/chosen": -0.687096118927002, "rewards/margins": 0.7125922441482544, "rewards/margins_max": 1.197318196296692, "rewards/margins_min": 0.22786636650562286, "rewards/margins_std": 0.6855059266090393, "rewards/rejected": -1.3996882438659668, "step": 700 }, { "epoch": 0.29, "grad_norm": 0.7265625, "learning_rate": 1.7829188410642288e-06, "logits/chosen": 0.03392393887042999, "logits/rejected": 0.7335731983184814, "logps/chosen": -328.3592224121094, "logps/rejected": -401.75653076171875, "loss": 0.3674, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5749274492263794, "rewards/margins": 1.163051962852478, "rewards/margins_max": 1.8984100818634033, "rewards/margins_min": 0.42769408226013184, "rewards/margins_std": 1.0399531126022339, "rewards/rejected": -1.737979531288147, "step": 710 }, { "epoch": 0.3, "grad_norm": 0.578125, "learning_rate": 1.7738933297082363e-06, "logits/chosen": -0.004636755678802729, "logits/rejected": 0.5097047090530396, "logps/chosen": -307.2280578613281, "logps/rejected": -362.5736389160156, "loss": 0.3987, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6072796583175659, "rewards/margins": 1.077012300491333, "rewards/margins_max": 1.6295830011367798, "rewards/margins_min": 0.5244414806365967, "rewards/margins_std": 0.781453013420105, "rewards/rejected": -1.6842920780181885, "step": 720 }, { "epoch": 0.3, "grad_norm": 0.71484375, "learning_rate": 1.7647078365869988e-06, "logits/chosen": -0.08409127593040466, "logits/rejected": 0.4622929096221924, "logps/chosen": -307.9881896972656, "logps/rejected": -387.77264404296875, "loss": 0.3973, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5732977986335754, "rewards/margins": 0.9682035446166992, "rewards/margins_max": 1.5226585865020752, "rewards/margins_min": 0.41374826431274414, "rewards/margins_std": 0.7841179966926575, "rewards/rejected": -1.5415012836456299, "step": 730 }, { "epoch": 0.3, "grad_norm": 0.6953125, "learning_rate": 1.7553642605557558e-06, "logits/chosen": 0.028938591480255127, "logits/rejected": 0.5773764848709106, "logps/chosen": -324.56439208984375, "logps/rejected": -408.5997314453125, "loss": 0.3654, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5612246990203857, "rewards/margins": 1.1907762289047241, "rewards/margins_max": 1.8759260177612305, "rewards/margins_min": 0.5056263208389282, "rewards/margins_std": 0.968948245048523, "rewards/rejected": -1.7520010471343994, "step": 740 }, { "epoch": 0.31, "grad_norm": 0.7265625, "learning_rate": 1.745864533149165e-06, "logits/chosen": -0.1704981029033661, "logits/rejected": 0.4236753582954407, "logps/chosen": -336.75787353515625, "logps/rejected": -425.0238342285156, "loss": 0.4012, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7402617335319519, "rewards/margins": 1.3461772203445435, "rewards/margins_max": 2.0805575847625732, "rewards/margins_min": 0.6117968559265137, "rewards/margins_std": 1.0385706424713135, "rewards/rejected": -2.0864386558532715, "step": 750 }, { "epoch": 0.31, "grad_norm": 0.8984375, "learning_rate": 1.7362106181820062e-06, "logits/chosen": -0.05409733206033707, "logits/rejected": 0.6319289803504944, "logps/chosen": -323.01031494140625, "logps/rejected": -408.16046142578125, "loss": 0.4064, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6317715048789978, "rewards/margins": 1.3276995420455933, "rewards/margins_max": 2.0891098976135254, "rewards/margins_min": 0.566289484500885, "rewards/margins_std": 1.076796531677246, "rewards/rejected": -1.9594709873199463, "step": 760 }, { "epoch": 0.32, "grad_norm": 0.66796875, "learning_rate": 1.7264045113432197e-06, "logits/chosen": -0.05517064407467842, "logits/rejected": 0.5958997011184692, "logps/chosen": -377.96453857421875, "logps/rejected": -445.7513732910156, "loss": 0.3622, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7109798192977905, "rewards/margins": 1.3723578453063965, "rewards/margins_max": 2.0370492935180664, "rewards/margins_min": 0.7076665163040161, "rewards/margins_std": 0.9400156140327454, "rewards/rejected": -2.0833375453948975, "step": 770 }, { "epoch": 0.32, "grad_norm": 0.984375, "learning_rate": 1.7164482397833462e-06, "logits/chosen": 0.047315459698438644, "logits/rejected": 0.658032238483429, "logps/chosen": -330.7193298339844, "logps/rejected": -425.60394287109375, "loss": 0.3565, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7782403826713562, "rewards/margins": 1.2866474390029907, "rewards/margins_max": 1.9514182806015015, "rewards/margins_min": 0.6218767762184143, "rewards/margins_std": 0.9401277303695679, "rewards/rejected": -2.064887762069702, "step": 780 }, { "epoch": 0.33, "grad_norm": 0.81640625, "learning_rate": 1.70634386169547e-06, "logits/chosen": -0.034783005714416504, "logits/rejected": 0.5936989188194275, "logps/chosen": -369.987548828125, "logps/rejected": -421.5458068847656, "loss": 0.4285, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8623000979423523, "rewards/margins": 1.0687025785446167, "rewards/margins_max": 1.701873779296875, "rewards/margins_min": 0.43553146719932556, "rewards/margins_std": 0.8954392671585083, "rewards/rejected": -1.9310028553009033, "step": 790 }, { "epoch": 0.33, "grad_norm": 1.1796875, "learning_rate": 1.696093465889743e-06, "logits/chosen": -0.0015446215402334929, "logits/rejected": 0.5988802313804626, "logps/chosen": -329.92071533203125, "logps/rejected": -458.45196533203125, "loss": 0.3701, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9185296297073364, "rewards/margins": 1.5133209228515625, "rewards/margins_max": 2.527653694152832, "rewards/margins_min": 0.49898791313171387, "rewards/margins_std": 1.4344834089279175, "rewards/rejected": -2.4318506717681885, "step": 800 }, { "epoch": 0.33, "grad_norm": 0.91796875, "learning_rate": 1.6856991713615775e-06, "logits/chosen": -0.036314308643341064, "logits/rejected": 0.5499120354652405, "logps/chosen": -344.57598876953125, "logps/rejected": -425.4794921875, "loss": 0.3308, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8832759857177734, "rewards/margins": 1.2336915731430054, "rewards/margins_max": 1.9916486740112305, "rewards/margins_min": 0.4757346212863922, "rewards/margins_std": 1.0719130039215088, "rewards/rejected": -2.1169676780700684, "step": 810 }, { "epoch": 0.34, "grad_norm": 1.2421875, "learning_rate": 1.6751631268536018e-06, "logits/chosen": -0.07272686064243317, "logits/rejected": 0.5183537602424622, "logps/chosen": -353.64892578125, "logps/rejected": -507.5834045410156, "loss": 0.3119, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9213719367980957, "rewards/margins": 1.8004786968231201, "rewards/margins_max": 2.6694061756134033, "rewards/margins_min": 0.9315509796142578, "rewards/margins_std": 1.2288492918014526, "rewards/rejected": -2.721850872039795, "step": 820 }, { "epoch": 0.34, "grad_norm": 0.8359375, "learning_rate": 1.664487510411464e-06, "logits/chosen": -0.09257794171571732, "logits/rejected": 0.5011342167854309, "logps/chosen": -348.8028259277344, "logps/rejected": -537.207275390625, "loss": 0.3326, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.0882186889648438, "rewards/margins": 2.0627102851867676, "rewards/margins_max": 3.325389862060547, "rewards/margins_min": 0.8000311851501465, "rewards/margins_std": 1.7856981754302979, "rewards/rejected": -3.1509292125701904, "step": 830 }, { "epoch": 0.35, "grad_norm": 0.78515625, "learning_rate": 1.65367452893358e-06, "logits/chosen": -0.002668508794158697, "logits/rejected": 0.6281024813652039, "logps/chosen": -369.59014892578125, "logps/rejected": -580.8179931640625, "loss": 0.3385, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.1532074213027954, "rewards/margins": 2.41599702835083, "rewards/margins_max": 3.841174364089966, "rewards/margins_min": 0.9908199310302734, "rewards/margins_std": 2.015505313873291, "rewards/rejected": -3.569204807281494, "step": 840 }, { "epoch": 0.35, "grad_norm": 1.4921875, "learning_rate": 1.6427264177149165e-06, "logits/chosen": 0.042776815593242645, "logits/rejected": 0.6086363792419434, "logps/chosen": -355.4585876464844, "logps/rejected": -518.1069946289062, "loss": 0.3465, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1320966482162476, "rewards/margins": 1.8160098791122437, "rewards/margins_max": 2.7302584648132324, "rewards/margins_min": 0.9017614126205444, "rewards/margins_std": 1.2929426431655884, "rewards/rejected": -2.9481067657470703, "step": 850 }, { "epoch": 0.35, "grad_norm": 0.66015625, "learning_rate": 1.6316454399849025e-06, "logits/chosen": 0.07836954295635223, "logits/rejected": 0.7413384914398193, "logps/chosen": -389.7508544921875, "logps/rejected": -541.7508544921875, "loss": 0.3002, "rewards/accuracies": 0.9375, "rewards/chosen": -1.192826509475708, "rewards/margins": 1.808393120765686, "rewards/margins_max": 2.9427132606506348, "rewards/margins_min": 0.674072802066803, "rewards/margins_std": 1.6041711568832397, "rewards/rejected": -3.0012195110321045, "step": 860 }, { "epoch": 0.36, "grad_norm": 1.546875, "learning_rate": 1.620433886439568e-06, "logits/chosen": 0.029862603172659874, "logits/rejected": 0.5954081416130066, "logps/chosen": -338.1944885253906, "logps/rejected": -488.2452087402344, "loss": 0.3331, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9371468424797058, "rewards/margins": 1.646079659461975, "rewards/margins_max": 2.532169818878174, "rewards/margins_min": 0.7599895596504211, "rewards/margins_std": 1.25312077999115, "rewards/rejected": -2.5832266807556152, "step": 870 }, { "epoch": 0.36, "grad_norm": 0.6171875, "learning_rate": 1.6090940747680032e-06, "logits/chosen": -0.009293178096413612, "logits/rejected": 0.6269145607948303, "logps/chosen": -374.23455810546875, "logps/rejected": -615.919189453125, "loss": 0.3139, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2220368385314941, "rewards/margins": 2.7851784229278564, "rewards/margins_max": 4.606410980224609, "rewards/margins_min": 0.963945746421814, "rewards/margins_std": 2.5756115913391113, "rewards/rejected": -4.0072150230407715, "step": 880 }, { "epoch": 0.37, "grad_norm": 0.58203125, "learning_rate": 1.5976283491732386e-06, "logits/chosen": -0.046679772436618805, "logits/rejected": 0.6183110475540161, "logps/chosen": -391.77337646484375, "logps/rejected": -539.9234619140625, "loss": 0.2852, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.5056828260421753, "rewards/margins": 2.0037107467651367, "rewards/margins_max": 3.213691234588623, "rewards/margins_min": 0.7937299609184265, "rewards/margins_std": 1.7111711502075195, "rewards/rejected": -3.5093936920166016, "step": 890 }, { "epoch": 0.37, "grad_norm": 1.2421875, "learning_rate": 1.5860390798876432e-06, "logits/chosen": 0.10775299370288849, "logits/rejected": 0.7166673541069031, "logps/chosen": -399.3635559082031, "logps/rejected": -538.3640747070312, "loss": 0.3538, "rewards/accuracies": 0.875, "rewards/chosen": -1.2117259502410889, "rewards/margins": 1.6585693359375, "rewards/margins_max": 2.707691192626953, "rewards/margins_min": 0.6094473004341125, "rewards/margins_std": 1.4836825132369995, "rewards/rejected": -2.870295286178589, "step": 900 }, { "epoch": 0.37, "grad_norm": 0.59765625, "learning_rate": 1.5743286626829435e-06, "logits/chosen": 0.022806577384471893, "logits/rejected": 0.5851965546607971, "logps/chosen": -390.1748962402344, "logps/rejected": -625.4842529296875, "loss": 0.3135, "rewards/accuracies": 0.875, "rewards/chosen": -1.6653339862823486, "rewards/margins": 2.4102187156677246, "rewards/margins_max": 4.1714911460876465, "rewards/margins_min": 0.6489461660385132, "rewards/margins_std": 2.4908154010772705, "rewards/rejected": -4.075552463531494, "step": 910 }, { "epoch": 0.38, "grad_norm": 1.0546875, "learning_rate": 1.5624995183749601e-06, "logits/chosen": -0.08865977823734283, "logits/rejected": 0.5650321245193481, "logps/chosen": -385.13409423828125, "logps/rejected": -572.8970336914062, "loss": 0.3019, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.4613841772079468, "rewards/margins": 2.3046936988830566, "rewards/margins_max": 3.730020523071289, "rewards/margins_min": 0.8793666958808899, "rewards/margins_std": 2.015716552734375, "rewards/rejected": -3.766078233718872, "step": 920 }, { "epoch": 0.38, "grad_norm": 1.421875, "learning_rate": 1.5505540923231695e-06, "logits/chosen": 0.09656616300344467, "logits/rejected": 0.7050750851631165, "logps/chosen": -410.744873046875, "logps/rejected": -602.7825927734375, "loss": 0.2936, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.506037950515747, "rewards/margins": 2.2272439002990723, "rewards/margins_max": 3.3739781379699707, "rewards/margins_min": 1.0805096626281738, "rewards/margins_std": 1.6217267513275146, "rewards/rejected": -3.7332820892333984, "step": 930 }, { "epoch": 0.39, "grad_norm": 2.40625, "learning_rate": 1.5384948539251919e-06, "logits/chosen": -0.046519361436367035, "logits/rejected": 0.5364492535591125, "logps/chosen": -407.41961669921875, "logps/rejected": -627.1873168945312, "loss": 0.3125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7679170370101929, "rewards/margins": 2.495460033416748, "rewards/margins_max": 4.152594566345215, "rewards/margins_min": 0.8383258581161499, "rewards/margins_std": 2.3435416221618652, "rewards/rejected": -4.2633771896362305, "step": 940 }, { "epoch": 0.39, "grad_norm": 0.65234375, "learning_rate": 1.5263242961063074e-06, "logits/chosen": -0.01217577327042818, "logits/rejected": 0.7021702527999878, "logps/chosen": -418.34674072265625, "logps/rejected": -653.3511352539062, "loss": 0.2666, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6848056316375732, "rewards/margins": 2.7814083099365234, "rewards/margins_max": 4.369351387023926, "rewards/margins_min": 1.1934659481048584, "rewards/margins_std": 2.245690107345581, "rewards/rejected": -4.466213703155518, "step": 950 }, { "epoch": 0.4, "grad_norm": 0.7421875, "learning_rate": 1.5140449348041133e-06, "logits/chosen": 0.15041589736938477, "logits/rejected": 0.7314427495002747, "logps/chosen": -418.79571533203125, "logps/rejected": -602.0596923828125, "loss": 0.3008, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6542141437530518, "rewards/margins": 2.1634926795959473, "rewards/margins_max": 3.671837568283081, "rewards/margins_min": 0.6551474928855896, "rewards/margins_std": 2.133122205734253, "rewards/rejected": -3.817707061767578, "step": 960 }, { "epoch": 0.4, "grad_norm": 0.88671875, "learning_rate": 1.5016593084484188e-06, "logits/chosen": 0.006452396512031555, "logits/rejected": 0.6475186347961426, "logps/chosen": -441.0831604003906, "logps/rejected": -657.4892578125, "loss": 0.3244, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5441867113113403, "rewards/margins": 2.7112343311309814, "rewards/margins_max": 4.389430999755859, "rewards/margins_min": 1.0330379009246826, "rewards/margins_std": 2.373328685760498, "rewards/rejected": -4.2554216384887695, "step": 970 }, { "epoch": 0.4, "grad_norm": 1.21875, "learning_rate": 1.4891699774364925e-06, "logits/chosen": -0.04653478413820267, "logits/rejected": 0.585922122001648, "logps/chosen": -463.1758728027344, "logps/rejected": -709.259765625, "loss": 0.2991, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.9931418895721436, "rewards/margins": 2.939608097076416, "rewards/margins_max": 4.592248439788818, "rewards/margins_min": 1.2869676351547241, "rewards/margins_std": 2.337186336517334, "rewards/rejected": -4.932750225067139, "step": 980 }, { "epoch": 0.41, "grad_norm": 1.109375, "learning_rate": 1.4765795236037705e-06, "logits/chosen": 0.11729402840137482, "logits/rejected": 0.6863471269607544, "logps/chosen": -499.93182373046875, "logps/rejected": -771.2833251953125, "loss": 0.2888, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.091594696044922, "rewards/margins": 3.2571346759796143, "rewards/margins_max": 5.196051597595215, "rewards/margins_min": 1.3182172775268555, "rewards/margins_std": 2.7420434951782227, "rewards/rejected": -5.348729133605957, "step": 990 }, { "epoch": 0.41, "grad_norm": 0.6875, "learning_rate": 1.463890549690129e-06, "logits/chosen": -0.0073835537768900394, "logits/rejected": 0.5565542578697205, "logps/chosen": -399.1741638183594, "logps/rejected": -644.0343017578125, "loss": 0.3183, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6458442211151123, "rewards/margins": 2.509960412979126, "rewards/margins_max": 3.9676921367645264, "rewards/margins_min": 1.0522279739379883, "rewards/margins_std": 2.061544418334961, "rewards/rejected": -4.155804634094238, "step": 1000 }, { "epoch": 0.42, "grad_norm": 0.89453125, "learning_rate": 1.4511056788018387e-06, "logits/chosen": 0.03009922243654728, "logits/rejected": 0.7003548741340637, "logps/chosen": -442.25994873046875, "logps/rejected": -612.8889770507812, "loss": 0.3125, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.7707973718643188, "rewards/margins": 2.1549880504608154, "rewards/margins_max": 3.36381196975708, "rewards/margins_min": 0.9461652636528015, "rewards/margins_std": 1.7095340490341187, "rewards/rejected": -3.925785541534424, "step": 1010 }, { "epoch": 0.42, "grad_norm": 2.546875, "learning_rate": 1.438227553869307e-06, "logits/chosen": 0.08984429389238358, "logits/rejected": 0.6354426145553589, "logps/chosen": -430.12957763671875, "logps/rejected": -771.4137573242188, "loss": 0.2714, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.9618059396743774, "rewards/margins": 3.78582763671875, "rewards/margins_max": 6.3192243576049805, "rewards/margins_min": 1.25243079662323, "rewards/margins_std": 3.5827643871307373, "rewards/rejected": -5.747633457183838, "step": 1020 }, { "epoch": 0.42, "grad_norm": 3.859375, "learning_rate": 1.4252588371007226e-06, "logits/chosen": 0.038023028522729874, "logits/rejected": 0.7126880884170532, "logps/chosen": -465.6451721191406, "logps/rejected": -643.1898803710938, "loss": 0.3392, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.6084156036376953, "rewards/margins": 2.4245221614837646, "rewards/margins_max": 4.242518424987793, "rewards/margins_min": 0.6065254807472229, "rewards/margins_std": 2.571035861968994, "rewards/rejected": -4.032937526702881, "step": 1030 }, { "epoch": 0.43, "grad_norm": 0.77734375, "learning_rate": 1.412202209431716e-06, "logits/chosen": 0.014304918237030506, "logits/rejected": 0.6482391357421875, "logps/chosen": -429.71514892578125, "logps/rejected": -718.8599243164062, "loss": 0.2488, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7444334030151367, "rewards/margins": 3.125716209411621, "rewards/margins_max": 5.137998580932617, "rewards/margins_min": 1.1134343147277832, "rewards/margins_std": 2.845796823501587, "rewards/rejected": -4.870149612426758, "step": 1040 }, { "epoch": 0.43, "grad_norm": 6.40625, "learning_rate": 1.3990603699711468e-06, "logits/chosen": 0.13320419192314148, "logits/rejected": 0.7178879976272583, "logps/chosen": -432.60931396484375, "logps/rejected": -785.9718017578125, "loss": 0.2916, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9751720428466797, "rewards/margins": 3.503911256790161, "rewards/margins_max": 5.681495666503906, "rewards/margins_min": 1.3263267278671265, "rewards/margins_std": 3.0795693397521973, "rewards/rejected": -5.479083061218262, "step": 1050 }, { "epoch": 0.44, "grad_norm": 4.1875, "learning_rate": 1.3858360354431353e-06, "logits/chosen": -0.04740985855460167, "logits/rejected": 0.6082924008369446, "logps/chosen": -461.19854736328125, "logps/rejected": -738.2750244140625, "loss": 0.3134, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0386252403259277, "rewards/margins": 3.0969669818878174, "rewards/margins_max": 4.9145002365112305, "rewards/margins_min": 1.2794336080551147, "rewards/margins_std": 2.570380449295044, "rewards/rejected": -5.135591983795166, "step": 1060 }, { "epoch": 0.44, "grad_norm": 0.6171875, "learning_rate": 1.3725319396254528e-06, "logits/chosen": 0.06939555704593658, "logits/rejected": 0.721479058265686, "logps/chosen": -423.41082763671875, "logps/rejected": -740.5270385742188, "loss": 0.279, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.8382021188735962, "rewards/margins": 3.333662509918213, "rewards/margins_max": 5.442681312561035, "rewards/margins_min": 1.2246429920196533, "rewards/margins_std": 2.9826035499572754, "rewards/rejected": -5.171864032745361, "step": 1070 }, { "epoch": 0.44, "grad_norm": 0.71875, "learning_rate": 1.3591508327843857e-06, "logits/chosen": 0.05436503142118454, "logits/rejected": 0.669663667678833, "logps/chosen": -454.3190002441406, "logps/rejected": -752.100830078125, "loss": 0.2395, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.265321969985962, "rewards/margins": 3.1536312103271484, "rewards/margins_max": 4.983828544616699, "rewards/margins_min": 1.3234339952468872, "rewards/margins_std": 2.588289737701416, "rewards/rejected": -5.418953895568848, "step": 1080 }, { "epoch": 0.45, "grad_norm": 2.21875, "learning_rate": 1.3456954811061907e-06, "logits/chosen": 0.13526487350463867, "logits/rejected": 0.5835285782814026, "logps/chosen": -445.73199462890625, "logps/rejected": -693.9923706054688, "loss": 0.3059, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.054556131362915, "rewards/margins": 2.580620288848877, "rewards/margins_max": 4.460507869720459, "rewards/margins_min": 0.7007322907447815, "rewards/margins_std": 2.6585628986358643, "rewards/rejected": -4.635176658630371, "step": 1090 }, { "epoch": 0.45, "grad_norm": 0.6796875, "learning_rate": 1.3321686661252624e-06, "logits/chosen": -0.02377261593937874, "logits/rejected": 0.5029060244560242, "logps/chosen": -429.89276123046875, "logps/rejected": -790.6046142578125, "loss": 0.2515, "rewards/accuracies": 0.9375, "rewards/chosen": -2.046562910079956, "rewards/margins": 3.737776517868042, "rewards/margins_max": 6.0582804679870605, "rewards/margins_min": 1.4172735214233398, "rewards/margins_std": 3.2816872596740723, "rewards/rejected": -5.78433895111084, "step": 1100 }, { "epoch": 0.46, "grad_norm": 0.79296875, "learning_rate": 1.3185731841491217e-06, "logits/chosen": -0.05110805109143257, "logits/rejected": 0.5351995229721069, "logps/chosen": -491.8407287597656, "logps/rejected": -814.8461303710938, "loss": 0.3023, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.366361141204834, "rewards/margins": 3.5456528663635254, "rewards/margins_max": 5.71653938293457, "rewards/margins_min": 1.3747665882110596, "rewards/margins_std": 3.070096969604492, "rewards/rejected": -5.912014484405518, "step": 1110 }, { "epoch": 0.46, "grad_norm": 1.09375, "learning_rate": 1.3049118456803566e-06, "logits/chosen": 0.06179860979318619, "logits/rejected": 0.6705427169799805, "logps/chosen": -449.32220458984375, "logps/rejected": -707.3748779296875, "loss": 0.3074, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1201488971710205, "rewards/margins": 3.0255444049835205, "rewards/margins_max": 4.990096569061279, "rewards/margins_min": 1.0609924793243408, "rewards/margins_std": 2.7782959938049316, "rewards/rejected": -5.145693778991699, "step": 1120 }, { "epoch": 0.47, "grad_norm": 1.03125, "learning_rate": 1.2911874748356252e-06, "logits/chosen": -0.028266632929444313, "logits/rejected": 0.6430469751358032, "logps/chosen": -443.5455017089844, "logps/rejected": -795.4043579101562, "loss": 0.2608, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1805038452148438, "rewards/margins": 3.5634307861328125, "rewards/margins_max": 5.326353073120117, "rewards/margins_min": 1.800508737564087, "rewards/margins_std": 2.4931483268737793, "rewards/rejected": -5.743934631347656, "step": 1130 }, { "epoch": 0.47, "grad_norm": 0.66796875, "learning_rate": 1.2774029087618445e-06, "logits/chosen": -0.06501082330942154, "logits/rejected": 0.5853902101516724, "logps/chosen": -521.4249877929688, "logps/rejected": -714.2313232421875, "loss": 0.2854, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.3054754734039307, "rewards/margins": 2.6544671058654785, "rewards/margins_max": 4.386072635650635, "rewards/margins_min": 0.9228616952896118, "rewards/margins_std": 2.448859691619873, "rewards/rejected": -4.959942817687988, "step": 1140 }, { "epoch": 0.47, "grad_norm": 1.6328125, "learning_rate": 1.263560997049687e-06, "logits/chosen": -0.01226266659796238, "logits/rejected": 0.6223964691162109, "logps/chosen": -500.0978088378906, "logps/rejected": -738.5953369140625, "loss": 0.2613, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.160482883453369, "rewards/margins": 3.0736801624298096, "rewards/margins_max": 4.78936767578125, "rewards/margins_min": 1.3579928874969482, "rewards/margins_std": 2.4263482093811035, "rewards/rejected": -5.234162330627441, "step": 1150 }, { "epoch": 0.48, "grad_norm": 1.4140625, "learning_rate": 1.2496646011445024e-06, "logits/chosen": 0.13887211680412292, "logits/rejected": 0.6845839619636536, "logps/chosen": -482.53924560546875, "logps/rejected": -763.1187744140625, "loss": 0.2828, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.2161526679992676, "rewards/margins": 3.281670331954956, "rewards/margins_max": 5.1992340087890625, "rewards/margins_min": 1.36410653591156, "rewards/margins_std": 2.7118449211120605, "rewards/rejected": -5.497823715209961, "step": 1160 }, { "epoch": 0.48, "grad_norm": 0.87890625, "learning_rate": 1.2357165937547932e-06, "logits/chosen": 0.11630807816982269, "logits/rejected": 0.8136134147644043, "logps/chosen": -416.43377685546875, "logps/rejected": -637.5525512695312, "loss": 0.3023, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.602250099182129, "rewards/margins": 2.4919466972351074, "rewards/margins_max": 4.262382507324219, "rewards/margins_min": 0.7215104103088379, "rewards/margins_std": 2.503774642944336, "rewards/rejected": -4.0941972732543945, "step": 1170 }, { "epoch": 0.49, "grad_norm": 1.1015625, "learning_rate": 1.2217198582583553e-06, "logits/chosen": 0.1416541039943695, "logits/rejected": 0.6444225907325745, "logps/chosen": -453.56085205078125, "logps/rejected": -785.13623046875, "loss": 0.3186, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.104236602783203, "rewards/margins": 3.521073818206787, "rewards/margins_max": 6.118127822875977, "rewards/margins_min": 0.9240198135375977, "rewards/margins_std": 3.6727893352508545, "rewards/rejected": -5.62531042098999, "step": 1180 }, { "epoch": 0.49, "grad_norm": 1.203125, "learning_rate": 1.20767728810622e-06, "logits/chosen": 0.098558709025383, "logits/rejected": 0.7595298886299133, "logps/chosen": -484.1192932128906, "logps/rejected": -835.5067138671875, "loss": 0.2177, "rewards/accuracies": 0.9375, "rewards/chosen": -2.26068115234375, "rewards/margins": 3.935065746307373, "rewards/margins_max": 5.916754722595215, "rewards/margins_min": 1.9533783197402954, "rewards/margins_std": 2.802530288696289, "rewards/rejected": -6.195747375488281, "step": 1190 }, { "epoch": 0.49, "grad_norm": 1.2578125, "learning_rate": 1.1935917862245069e-06, "logits/chosen": 0.009665842168033123, "logits/rejected": 0.692471981048584, "logps/chosen": -446.05364990234375, "logps/rejected": -773.1881103515625, "loss": 0.3002, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.023267984390259, "rewards/margins": 3.7675445079803467, "rewards/margins_max": 6.028232574462891, "rewards/margins_min": 1.506856083869934, "rewards/margins_std": 3.197096347808838, "rewards/rejected": -5.7908124923706055, "step": 1200 }, { "epoch": 0.5, "grad_norm": 0.7890625, "learning_rate": 1.1794662644143256e-06, "logits/chosen": 0.022773366421461105, "logits/rejected": 0.6556586027145386, "logps/chosen": -499.6419982910156, "logps/rejected": -979.7394409179688, "loss": 0.2683, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6731841564178467, "rewards/margins": 5.120915412902832, "rewards/margins_max": 8.691125869750977, "rewards/margins_min": 1.5507053136825562, "rewards/margins_std": 5.049039840698242, "rewards/rejected": -7.7940993309021, "step": 1210 }, { "epoch": 0.5, "grad_norm": 0.9453125, "learning_rate": 1.1653036427498352e-06, "logits/chosen": 0.06103574112057686, "logits/rejected": 0.5934966206550598, "logps/chosen": -460.0519104003906, "logps/rejected": -799.0787353515625, "loss": 0.2657, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.192873239517212, "rewards/margins": 3.3739676475524902, "rewards/margins_max": 5.504680156707764, "rewards/margins_min": 1.2432544231414795, "rewards/margins_std": 3.0132834911346436, "rewards/rejected": -5.566840648651123, "step": 1220 }, { "epoch": 0.51, "grad_norm": 0.75390625, "learning_rate": 1.1511068489745986e-06, "logits/chosen": 0.07468974590301514, "logits/rejected": 0.8071237802505493, "logps/chosen": -477.79571533203125, "logps/rejected": -913.1275634765625, "loss": 0.249, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.136107921600342, "rewards/margins": 4.6890764236450195, "rewards/margins_max": 7.924353122711182, "rewards/margins_min": 1.4538004398345947, "rewards/margins_std": 4.575371742248535, "rewards/rejected": -6.8251848220825195, "step": 1230 }, { "epoch": 0.51, "grad_norm": 0.63671875, "learning_rate": 1.1368788178963491e-06, "logits/chosen": 0.06184614449739456, "logits/rejected": 0.6528698205947876, "logps/chosen": -485.544189453125, "logps/rejected": -858.0904541015625, "loss": 0.253, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4247965812683105, "rewards/margins": 3.964677333831787, "rewards/margins_max": 5.929955959320068, "rewards/margins_min": 1.9993988275527954, "rewards/margins_std": 2.7793235778808594, "rewards/rejected": -6.389473915100098, "step": 1240 }, { "epoch": 0.51, "grad_norm": 1.2109375, "learning_rate": 1.1226224907802983e-06, "logits/chosen": 0.134813591837883, "logits/rejected": 0.5963112115859985, "logps/chosen": -488.6258239746094, "logps/rejected": -963.7542724609375, "loss": 0.2625, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.413456439971924, "rewards/margins": 5.094055652618408, "rewards/margins_max": 8.218836784362793, "rewards/margins_min": 1.9692729711532593, "rewards/margins_std": 4.419109344482422, "rewards/rejected": -7.507512092590332, "step": 1250 }, { "epoch": 0.52, "grad_norm": 3.046875, "learning_rate": 1.1083408147411073e-06, "logits/chosen": 0.2207455337047577, "logits/rejected": 0.8767908811569214, "logps/chosen": -470.7706604003906, "logps/rejected": -718.1776123046875, "loss": 0.2267, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.117788791656494, "rewards/margins": 3.199702024459839, "rewards/margins_max": 5.136730670928955, "rewards/margins_min": 1.2626738548278809, "rewards/margins_std": 2.7393720149993896, "rewards/rejected": -5.317490577697754, "step": 1260 }, { "epoch": 0.52, "grad_norm": 1.40625, "learning_rate": 1.0940367421336488e-06, "logits/chosen": 0.10231053829193115, "logits/rejected": 0.6745079755783081, "logps/chosen": -482.825927734375, "logps/rejected": -798.8470458984375, "loss": 0.2976, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.555229663848877, "rewards/margins": 3.457376480102539, "rewards/margins_max": 5.613675594329834, "rewards/margins_min": 1.301077127456665, "rewards/margins_std": 3.0494678020477295, "rewards/rejected": -6.012606143951416, "step": 1270 }, { "epoch": 0.53, "grad_norm": 3.0625, "learning_rate": 1.079713229942688e-06, "logits/chosen": 0.14812633395195007, "logits/rejected": 0.7574166059494019, "logps/chosen": -506.6991271972656, "logps/rejected": -787.7262573242188, "loss": 0.2928, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6476612091064453, "rewards/margins": 3.1967766284942627, "rewards/margins_max": 5.269640922546387, "rewards/margins_min": 1.1239116191864014, "rewards/margins_std": 2.931473731994629, "rewards/rejected": -5.844437599182129, "step": 1280 }, { "epoch": 0.53, "grad_norm": 1.21875, "learning_rate": 1.0653732391716053e-06, "logits/chosen": 0.12779296934604645, "logits/rejected": 0.6562881469726562, "logps/chosen": -452.0044860839844, "logps/rejected": -755.9925537109375, "loss": 0.2875, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.2710297107696533, "rewards/margins": 3.172046184539795, "rewards/margins_max": 5.278564929962158, "rewards/margins_min": 1.0655282735824585, "rewards/margins_std": 2.9790663719177246, "rewards/rejected": -5.443076133728027, "step": 1290 }, { "epoch": 0.54, "grad_norm": 0.88671875, "learning_rate": 1.0510197342302864e-06, "logits/chosen": 0.15161243081092834, "logits/rejected": 0.683529257774353, "logps/chosen": -446.7079162597656, "logps/rejected": -819.0192260742188, "loss": 0.3016, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.081666946411133, "rewards/margins": 3.9550280570983887, "rewards/margins_max": 6.584043025970459, "rewards/margins_min": 1.3260126113891602, "rewards/margins_std": 3.717989444732666, "rewards/rejected": -6.036694526672363, "step": 1300 }, { "epoch": 0.54, "grad_norm": 1.28125, "learning_rate": 1.0366556823223101e-06, "logits/chosen": 0.20373359322547913, "logits/rejected": 0.6877504587173462, "logps/chosen": -456.41015625, "logps/rejected": -850.4407958984375, "loss": 0.2665, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.4085049629211426, "rewards/margins": 4.007854461669922, "rewards/margins_max": 6.256316184997559, "rewards/margins_min": 1.7593927383422852, "rewards/margins_std": 3.1798055171966553, "rewards/rejected": -6.416359901428223, "step": 1310 }, { "epoch": 0.54, "grad_norm": 1.53125, "learning_rate": 1.02228405283156e-06, "logits/chosen": -0.020195502787828445, "logits/rejected": 0.6102248430252075, "logps/chosen": -493.6954040527344, "logps/rejected": -811.216064453125, "loss": 0.2313, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.36457896232605, "rewards/margins": 3.4458298683166504, "rewards/margins_max": 5.659657955169678, "rewards/margins_min": 1.2320020198822021, "rewards/margins_std": 3.130825996398926, "rewards/rejected": -5.810408592224121, "step": 1320 }, { "epoch": 0.55, "grad_norm": 0.8828125, "learning_rate": 1.0079078167083814e-06, "logits/chosen": 0.16562719643115997, "logits/rejected": 0.8193610906600952, "logps/chosen": -526.57763671875, "logps/rejected": -813.3748168945312, "loss": 0.2867, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.440490245819092, "rewards/margins": 3.532073497772217, "rewards/margins_max": 5.491944789886475, "rewards/margins_min": 1.5722014904022217, "rewards/margins_std": 2.7716774940490723, "rewards/rejected": -5.972563743591309, "step": 1330 }, { "epoch": 0.55, "grad_norm": 1.1171875, "learning_rate": 9.935299458554181e-07, "logits/chosen": -0.01127061527222395, "logits/rejected": 0.612311601638794, "logps/chosen": -485.59979248046875, "logps/rejected": -858.2190551757812, "loss": 0.2744, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3641650676727295, "rewards/margins": 4.078332901000977, "rewards/margins_max": 6.7388482093811035, "rewards/margins_min": 1.4178178310394287, "rewards/margins_std": 3.7625365257263184, "rewards/rejected": -6.442498683929443, "step": 1340 }, { "epoch": 0.56, "grad_norm": 2.671875, "learning_rate": 9.791534125132508e-07, "logits/chosen": 0.03967234492301941, "logits/rejected": 0.7782914042472839, "logps/chosen": -573.2506713867188, "logps/rejected": -899.2452392578125, "loss": 0.2705, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7396163940429688, "rewards/margins": 4.099890232086182, "rewards/margins_max": 6.30058479309082, "rewards/margins_min": 1.8991953134536743, "rewards/margins_std": 3.1122524738311768, "rewards/rejected": -6.839505672454834, "step": 1350 }, { "epoch": 0.56, "grad_norm": 0.83984375, "learning_rate": 9.64781188645965e-07, "logits/chosen": 0.10714595019817352, "logits/rejected": 0.7092632055282593, "logps/chosen": -544.226806640625, "logps/rejected": -752.5347290039062, "loss": 0.3475, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.7692503929138184, "rewards/margins": 2.6476550102233887, "rewards/margins_max": 4.169407844543457, "rewards/margins_min": 1.125902533531189, "rewards/margins_std": 2.152083396911621, "rewards/rejected": -5.416905403137207, "step": 1360 }, { "epoch": 0.56, "grad_norm": 1.328125, "learning_rate": 9.504162453267776e-07, "logits/chosen": -0.025841986760497093, "logits/rejected": 0.49216756224632263, "logps/chosen": -457.9346618652344, "logps/rejected": -994.6002807617188, "loss": 0.2278, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.288224458694458, "rewards/margins": 5.407242774963379, "rewards/margins_max": 8.10576343536377, "rewards/margins_min": 2.70872163772583, "rewards/margins_std": 3.8162853717803955, "rewards/rejected": -7.695467472076416, "step": 1370 }, { "epoch": 0.57, "grad_norm": 0.9453125, "learning_rate": 9.360615521238475e-07, "logits/chosen": 0.24673600494861603, "logits/rejected": 0.7878357172012329, "logps/chosen": -515.7044067382812, "logps/rejected": -769.2423706054688, "loss": 0.2768, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.598513603210449, "rewards/margins": 2.9555516242980957, "rewards/margins_max": 4.985101699829102, "rewards/margins_min": 0.9260021448135376, "rewards/margins_std": 2.8702168464660645, "rewards/rejected": -5.554066181182861, "step": 1380 }, { "epoch": 0.57, "grad_norm": 0.56640625, "learning_rate": 9.217200764863956e-07, "logits/chosen": 0.13058429956436157, "logits/rejected": 0.7461265325546265, "logps/chosen": -489.1720275878906, "logps/rejected": -916.9035034179688, "loss": 0.2409, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.4763474464416504, "rewards/margins": 4.656981468200684, "rewards/margins_max": 7.720522880554199, "rewards/margins_min": 1.5934394598007202, "rewards/margins_std": 4.332502365112305, "rewards/rejected": -7.133328914642334, "step": 1390 }, { "epoch": 0.58, "grad_norm": 4.125, "learning_rate": 9.073947831312634e-07, "logits/chosen": 0.19845367968082428, "logits/rejected": 0.6116907000541687, "logps/chosen": -449.885498046875, "logps/rejected": -832.296875, "loss": 0.2708, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.4242727756500244, "rewards/margins": 3.8387694358825684, "rewards/margins_max": 6.290716648101807, "rewards/margins_min": 1.386821985244751, "rewards/margins_std": 3.4675774574279785, "rewards/rejected": -6.263042449951172, "step": 1400 }, { "epoch": 0.58, "grad_norm": 0.71875, "learning_rate": 8.930886334300395e-07, "logits/chosen": 0.06111987307667732, "logits/rejected": 0.71096271276474, "logps/chosen": -545.9169921875, "logps/rejected": -853.5006103515625, "loss": 0.2583, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.761983633041382, "rewards/margins": 3.652026653289795, "rewards/margins_max": 6.0150017738342285, "rewards/margins_min": 1.2890517711639404, "rewards/margins_std": 3.3417510986328125, "rewards/rejected": -6.414010047912598, "step": 1410 }, { "epoch": 0.58, "grad_norm": 1.3125, "learning_rate": 8.78804584796872e-07, "logits/chosen": 0.035090453922748566, "logits/rejected": 0.6259672045707703, "logps/chosen": -474.5460510253906, "logps/rejected": -847.3800048828125, "loss": 0.2675, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.222008466720581, "rewards/margins": 3.796189785003662, "rewards/margins_max": 6.324049949645996, "rewards/margins_min": 1.2683302164077759, "rewards/margins_std": 3.5749340057373047, "rewards/rejected": -6.018198013305664, "step": 1420 }, { "epoch": 0.59, "grad_norm": 2.6875, "learning_rate": 8.645455900771052e-07, "logits/chosen": 0.11879072338342667, "logits/rejected": 0.7143687009811401, "logps/chosen": -543.6822509765625, "logps/rejected": -943.021484375, "loss": 0.2134, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0677709579467773, "rewards/margins": 4.426670551300049, "rewards/margins_max": 7.0038580894470215, "rewards/margins_min": 1.8494832515716553, "rewards/margins_std": 3.644692897796631, "rewards/rejected": -7.494442939758301, "step": 1430 }, { "epoch": 0.59, "grad_norm": 0.86328125, "learning_rate": 8.503145969368561e-07, "logits/chosen": 0.0862460657954216, "logits/rejected": 0.5994366407394409, "logps/chosen": -504.5082092285156, "logps/rejected": -958.7373046875, "loss": 0.2234, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.5698065757751465, "rewards/margins": 4.695794105529785, "rewards/margins_max": 7.5340986251831055, "rewards/margins_min": 1.857489824295044, "rewards/margins_std": 4.013968467712402, "rewards/rejected": -7.265600681304932, "step": 1440 }, { "epoch": 0.6, "grad_norm": 1.2578125, "learning_rate": 8.361145472536617e-07, "logits/chosen": 0.148963063955307, "logits/rejected": 0.7144413590431213, "logps/chosen": -518.219970703125, "logps/rejected": -832.0569458007812, "loss": 0.2983, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.5618433952331543, "rewards/margins": 3.5098252296447754, "rewards/margins_max": 5.876803398132324, "rewards/margins_min": 1.1428462266921997, "rewards/margins_std": 3.3474135398864746, "rewards/rejected": -6.071669101715088, "step": 1450 }, { "epoch": 0.6, "grad_norm": 0.83984375, "learning_rate": 8.219483765083293e-07, "logits/chosen": 0.03291046619415283, "logits/rejected": 0.5989701151847839, "logps/chosen": -541.6590576171875, "logps/rejected": -917.2067260742188, "loss": 0.2071, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.0608572959899902, "rewards/margins": 4.072875022888184, "rewards/margins_max": 6.147627830505371, "rewards/margins_min": 1.9981224536895752, "rewards/margins_std": 2.934143304824829, "rewards/rejected": -7.133731842041016, "step": 1460 }, { "epoch": 0.61, "grad_norm": 0.703125, "learning_rate": 8.078190131780982e-07, "logits/chosen": 0.10352887213230133, "logits/rejected": 0.5709009766578674, "logps/chosen": -459.8164978027344, "logps/rejected": -901.9567260742188, "loss": 0.211, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.477700710296631, "rewards/margins": 4.241365909576416, "rewards/margins_max": 6.99854040145874, "rewards/margins_min": 1.4841907024383545, "rewards/margins_std": 3.8992340564727783, "rewards/rejected": -6.719066619873047, "step": 1470 }, { "epoch": 0.61, "grad_norm": 1.1953125, "learning_rate": 7.9372937813126e-07, "logits/chosen": 0.08338715136051178, "logits/rejected": 0.7014255523681641, "logps/chosen": -550.0524291992188, "logps/rejected": -882.2293090820312, "loss": 0.2304, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9287476539611816, "rewards/margins": 3.599247455596924, "rewards/margins_max": 5.723820686340332, "rewards/margins_min": 1.4746736288070679, "rewards/margins_std": 3.004601001739502, "rewards/rejected": -6.5279951095581055, "step": 1480 }, { "epoch": 0.61, "grad_norm": 2.6875, "learning_rate": 7.796823840233442e-07, "logits/chosen": 0.04040234535932541, "logits/rejected": 0.7684676051139832, "logps/chosen": -565.4691772460938, "logps/rejected": -789.9411010742188, "loss": 0.3451, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.7227869033813477, "rewards/margins": 2.916978359222412, "rewards/margins_max": 4.791220664978027, "rewards/margins_min": 1.0427358150482178, "rewards/margins_std": 2.6505794525146484, "rewards/rejected": -5.639765739440918, "step": 1490 }, { "epoch": 0.62, "grad_norm": 1.3828125, "learning_rate": 7.656809346950066e-07, "logits/chosen": 0.04201055318117142, "logits/rejected": 0.6305156946182251, "logps/chosen": -497.396240234375, "logps/rejected": -962.208984375, "loss": 0.2567, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.571776866912842, "rewards/margins": 4.9352216720581055, "rewards/margins_max": 8.420190811157227, "rewards/margins_min": 1.450252890586853, "rewards/margins_std": 4.928489685058594, "rewards/rejected": -7.506998538970947, "step": 1500 }, { "epoch": 0.62, "grad_norm": 1.046875, "learning_rate": 7.517279245717367e-07, "logits/chosen": 0.11693109571933746, "logits/rejected": 0.6607118844985962, "logps/chosen": -458.0367126464844, "logps/rejected": -978.6554565429688, "loss": 0.2966, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4729740619659424, "rewards/margins": 5.092294216156006, "rewards/margins_max": 8.261327743530273, "rewards/margins_min": 1.9232604503631592, "rewards/margins_std": 4.481690406799316, "rewards/rejected": -7.565268039703369, "step": 1510 }, { "epoch": 0.63, "grad_norm": 1.1875, "learning_rate": 7.378262380655118e-07, "logits/chosen": 0.055606938898563385, "logits/rejected": 0.7165523171424866, "logps/chosen": -509.15155029296875, "logps/rejected": -901.2708740234375, "loss": 0.2279, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.534592390060425, "rewards/margins": 4.365265846252441, "rewards/margins_max": 7.091165065765381, "rewards/margins_min": 1.6393667459487915, "rewards/margins_std": 3.8550033569335938, "rewards/rejected": -6.899857997894287, "step": 1520 }, { "epoch": 0.63, "grad_norm": 1.6484375, "learning_rate": 7.239787489785247e-07, "logits/chosen": 0.1286771148443222, "logits/rejected": 0.7139743566513062, "logps/chosen": -519.9439086914062, "logps/rejected": -866.30078125, "loss": 0.2376, "rewards/accuracies": 0.9375, "rewards/chosen": -2.677018404006958, "rewards/margins": 4.071950912475586, "rewards/margins_max": 7.0152387619018555, "rewards/margins_min": 1.1286628246307373, "rewards/margins_std": 4.16243839263916, "rewards/rejected": -6.748970031738281, "step": 1530 }, { "epoch": 0.63, "grad_norm": 0.75, "learning_rate": 7.101883199090987e-07, "logits/chosen": 0.1824348419904709, "logits/rejected": 0.6094152331352234, "logps/chosen": -533.3997802734375, "logps/rejected": -980.6068115234375, "loss": 0.1974, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.984278440475464, "rewards/margins": 4.562946796417236, "rewards/margins_max": 7.375452518463135, "rewards/margins_min": 1.7504408359527588, "rewards/margins_std": 3.9774837493896484, "rewards/rejected": -7.547224998474121, "step": 1540 }, { "epoch": 0.64, "grad_norm": 1.1015625, "learning_rate": 6.964578016599238e-07, "logits/chosen": 0.1528719961643219, "logits/rejected": 0.7347079515457153, "logps/chosen": -511.112060546875, "logps/rejected": -1012.8218994140625, "loss": 0.2411, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.4701285362243652, "rewards/margins": 5.128227233886719, "rewards/margins_max": 7.814431667327881, "rewards/margins_min": 2.4420230388641357, "rewards/margins_std": 3.7988662719726562, "rewards/rejected": -7.598355770111084, "step": 1550 }, { "epoch": 0.64, "grad_norm": 0.87890625, "learning_rate": 6.827900326487286e-07, "logits/chosen": 0.12157417833805084, "logits/rejected": 0.8184272646903992, "logps/chosen": -529.847900390625, "logps/rejected": -1091.0225830078125, "loss": 0.2172, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8752691745758057, "rewards/margins": 5.952185153961182, "rewards/margins_max": 9.909235000610352, "rewards/margins_min": 1.995133399963379, "rewards/margins_std": 5.596114635467529, "rewards/rejected": -8.82745361328125, "step": 1560 }, { "epoch": 0.65, "grad_norm": 3.078125, "learning_rate": 6.691878383215141e-07, "logits/chosen": 0.09048546850681305, "logits/rejected": 0.5541390776634216, "logps/chosen": -566.58349609375, "logps/rejected": -987.8902587890625, "loss": 0.3041, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.2955403327941895, "rewards/margins": 4.377233028411865, "rewards/margins_max": 6.9765167236328125, "rewards/margins_min": 1.7779486179351807, "rewards/margins_std": 3.6759426593780518, "rewards/rejected": -7.672772407531738, "step": 1570 }, { "epoch": 0.65, "grad_norm": 2.84375, "learning_rate": 6.556540305684669e-07, "logits/chosen": -0.011808687821030617, "logits/rejected": 0.6870378851890564, "logps/chosen": -546.3944091796875, "logps/rejected": -931.5330200195312, "loss": 0.2546, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.909376621246338, "rewards/margins": 4.274510860443115, "rewards/margins_max": 6.635194301605225, "rewards/margins_min": 1.913827896118164, "rewards/margins_std": 3.338510513305664, "rewards/rejected": -7.183887481689453, "step": 1580 }, { "epoch": 0.65, "grad_norm": 1.6015625, "learning_rate": 6.421914071426778e-07, "logits/chosen": -0.04181584715843201, "logits/rejected": 0.5624270439147949, "logps/chosen": -541.0858154296875, "logps/rejected": -971.2429809570312, "loss": 0.3229, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.9971630573272705, "rewards/margins": 4.618222236633301, "rewards/margins_max": 7.345058441162109, "rewards/margins_min": 1.891385793685913, "rewards/margins_std": 3.8563284873962402, "rewards/rejected": -7.615384578704834, "step": 1590 }, { "epoch": 0.66, "grad_norm": 0.63671875, "learning_rate": 6.288027510817791e-07, "logits/chosen": 0.15334565937519073, "logits/rejected": 0.8467614054679871, "logps/chosen": -588.7959594726562, "logps/rejected": -1015.6530151367188, "loss": 0.238, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.3042540550231934, "rewards/margins": 4.745846748352051, "rewards/margins_max": 7.398883819580078, "rewards/margins_min": 2.092808246612549, "rewards/margins_std": 3.751962184906006, "rewards/rejected": -8.050100326538086, "step": 1600 }, { "epoch": 0.66, "grad_norm": 0.86328125, "learning_rate": 6.154908301326289e-07, "logits/chosen": 0.07106464356184006, "logits/rejected": 0.6679781675338745, "logps/chosen": -521.3948364257812, "logps/rejected": -967.5283203125, "loss": 0.3055, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4520983695983887, "rewards/margins": 4.829917907714844, "rewards/margins_max": 7.988490104675293, "rewards/margins_min": 1.6713447570800781, "rewards/margins_std": 4.4668965339660645, "rewards/rejected": -7.282015800476074, "step": 1610 }, { "epoch": 0.67, "grad_norm": 1.0859375, "learning_rate": 6.022583961791494e-07, "logits/chosen": 0.06975733488798141, "logits/rejected": 0.6143258810043335, "logps/chosen": -547.225341796875, "logps/rejected": -929.7529296875, "loss": 0.2615, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.8573384284973145, "rewards/margins": 4.046209335327148, "rewards/margins_max": 6.497920036315918, "rewards/margins_min": 1.5944980382919312, "rewards/margins_std": 3.467243194580078, "rewards/rejected": -6.903547763824463, "step": 1620 }, { "epoch": 0.67, "grad_norm": 4.28125, "learning_rate": 5.891081846734518e-07, "logits/chosen": 0.03396327421069145, "logits/rejected": 0.680204451084137, "logps/chosen": -582.3570556640625, "logps/rejected": -1022.9677734375, "loss": 0.2667, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.1391189098358154, "rewards/margins": 4.894112586975098, "rewards/margins_max": 7.7569475173950195, "rewards/margins_min": 2.0312764644622803, "rewards/margins_std": 4.048661231994629, "rewards/rejected": -8.033230781555176, "step": 1630 }, { "epoch": 0.68, "grad_norm": 1.5, "learning_rate": 5.760429140703533e-07, "logits/chosen": 0.1480637490749359, "logits/rejected": 0.6867285370826721, "logps/chosen": -511.46832275390625, "logps/rejected": -906.4397583007812, "loss": 0.2314, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.893805980682373, "rewards/margins": 4.286385536193848, "rewards/margins_max": 6.519837856292725, "rewards/margins_min": 2.052934169769287, "rewards/margins_std": 3.1585774421691895, "rewards/rejected": -7.180192470550537, "step": 1640 }, { "epoch": 0.68, "grad_norm": 2.3125, "learning_rate": 5.63065285265409e-07, "logits/chosen": 0.016267577186226845, "logits/rejected": 0.6522185206413269, "logps/chosen": -537.4530029296875, "logps/rejected": -844.8182373046875, "loss": 0.31, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0314884185791016, "rewards/margins": 3.5092055797576904, "rewards/margins_max": 5.994574546813965, "rewards/margins_min": 1.0238367319107056, "rewards/margins_std": 3.514842987060547, "rewards/rejected": -6.540694236755371, "step": 1650 }, { "epoch": 0.68, "grad_norm": 1.2421875, "learning_rate": 5.501779810365744e-07, "logits/chosen": 0.10497574508190155, "logits/rejected": 0.6778794527053833, "logps/chosen": -606.8820190429688, "logps/rejected": -939.759765625, "loss": 0.2511, "rewards/accuracies": 0.9375, "rewards/chosen": -3.223036527633667, "rewards/margins": 3.837195873260498, "rewards/margins_max": 6.3507466316223145, "rewards/margins_min": 1.323644757270813, "rewards/margins_std": 3.5546982288360596, "rewards/rejected": -7.060232639312744, "step": 1660 }, { "epoch": 0.69, "grad_norm": 4.71875, "learning_rate": 5.373836654896127e-07, "logits/chosen": 0.21054425835609436, "logits/rejected": 0.7104513645172119, "logps/chosen": -551.0523071289062, "logps/rejected": -927.8137817382812, "loss": 0.2092, "rewards/accuracies": 0.9375, "rewards/chosen": -2.922978162765503, "rewards/margins": 4.11228084564209, "rewards/margins_max": 6.838304042816162, "rewards/margins_min": 1.3862587213516235, "rewards/margins_std": 3.855178117752075, "rewards/rejected": -7.035260200500488, "step": 1670 }, { "epoch": 0.69, "grad_norm": 3.4375, "learning_rate": 5.246849835073623e-07, "logits/chosen": 0.22590751945972443, "logits/rejected": 0.653782844543457, "logps/chosen": -501.31475830078125, "logps/rejected": -810.4156494140625, "loss": 0.2812, "rewards/accuracies": 0.9375, "rewards/chosen": -2.845599412918091, "rewards/margins": 3.173737049102783, "rewards/margins_max": 4.869901657104492, "rewards/margins_min": 1.4775731563568115, "rewards/margins_std": 2.398738384246826, "rewards/rejected": -6.019336700439453, "step": 1680 }, { "epoch": 0.7, "grad_norm": 0.80859375, "learning_rate": 5.120845602029775e-07, "logits/chosen": 0.1999841332435608, "logits/rejected": 0.7564531564712524, "logps/chosen": -498.1185607910156, "logps/rejected": -965.1281127929688, "loss": 0.1513, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7307372093200684, "rewards/margins": 4.529786586761475, "rewards/margins_max": 6.7064385414123535, "rewards/margins_min": 2.3531341552734375, "rewards/margins_std": 3.0782508850097656, "rewards/rejected": -7.260523796081543, "step": 1690 }, { "epoch": 0.7, "grad_norm": 1.2109375, "learning_rate": 4.995850003772563e-07, "logits/chosen": 0.1419237107038498, "logits/rejected": 0.6594254970550537, "logps/chosen": -526.7871704101562, "logps/rejected": -946.35400390625, "loss": 0.2515, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.9440999031066895, "rewards/margins": 4.337802886962891, "rewards/margins_max": 7.0636725425720215, "rewards/margins_min": 1.6119331121444702, "rewards/margins_std": 3.854962110519409, "rewards/rejected": -7.281902313232422, "step": 1700 }, { "epoch": 0.7, "grad_norm": 2.875, "learning_rate": 4.871888879801684e-07, "logits/chosen": 0.1439136564731598, "logits/rejected": 0.6816688776016235, "logps/chosen": -516.1966552734375, "logps/rejected": -867.43115234375, "loss": 0.2889, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.8488383293151855, "rewards/margins": 3.843712568283081, "rewards/margins_max": 6.232985019683838, "rewards/margins_min": 1.4544405937194824, "rewards/margins_std": 3.378941297531128, "rewards/rejected": -6.692551612854004, "step": 1710 }, { "epoch": 0.71, "grad_norm": 7.5625, "learning_rate": 4.7489878557669236e-07, "logits/chosen": 0.1883472502231598, "logits/rejected": 0.6354864835739136, "logps/chosen": -522.1241455078125, "logps/rejected": -909.0133666992188, "loss": 0.2573, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9454784393310547, "rewards/margins": 4.083091735839844, "rewards/margins_max": 7.010195732116699, "rewards/margins_min": 1.1559871435165405, "rewards/margins_std": 4.139551639556885, "rewards/rejected": -7.028570652008057, "step": 1720 }, { "epoch": 0.71, "grad_norm": 2.875, "learning_rate": 4.6271723381707204e-07, "logits/chosen": -0.010915858671069145, "logits/rejected": 0.5235196352005005, "logps/chosen": -535.598876953125, "logps/rejected": -1009.626953125, "loss": 0.2645, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0873775482177734, "rewards/margins": 4.849740028381348, "rewards/margins_max": 7.774973392486572, "rewards/margins_min": 1.9245054721832275, "rewards/margins_std": 4.136904716491699, "rewards/rejected": -7.937117099761963, "step": 1730 }, { "epoch": 0.72, "grad_norm": 3.203125, "learning_rate": 4.5064675091160777e-07, "logits/chosen": -0.08643798530101776, "logits/rejected": 0.6465299725532532, "logps/chosen": -634.9217529296875, "logps/rejected": -949.5773315429688, "loss": 0.5499, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.7211272716522217, "rewards/margins": 3.7824363708496094, "rewards/margins_max": 6.287593841552734, "rewards/margins_min": 1.2772791385650635, "rewards/margins_std": 3.5428271293640137, "rewards/rejected": -7.503562927246094, "step": 1740 }, { "epoch": 0.72, "grad_norm": 0.94921875, "learning_rate": 4.386898321100817e-07, "logits/chosen": 0.16737410426139832, "logits/rejected": 0.7846443057060242, "logps/chosen": -571.4229736328125, "logps/rejected": -1064.5064697265625, "loss": 0.2579, "rewards/accuracies": 0.9375, "rewards/chosen": -3.09366774559021, "rewards/margins": 5.503853797912598, "rewards/margins_max": 9.53122615814209, "rewards/margins_min": 1.476481318473816, "rewards/margins_std": 5.695565223693848, "rewards/rejected": -8.59752082824707, "step": 1750 }, { "epoch": 0.72, "grad_norm": 0.6796875, "learning_rate": 4.268489491859335e-07, "logits/chosen": 0.14373087882995605, "logits/rejected": 0.619976818561554, "logps/chosen": -571.4114990234375, "logps/rejected": -1039.7352294921875, "loss": 0.2761, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.399026870727539, "rewards/margins": 4.906405448913574, "rewards/margins_max": 7.806390285491943, "rewards/margins_min": 2.006420850753784, "rewards/margins_std": 4.101198196411133, "rewards/rejected": -8.30543327331543, "step": 1760 }, { "epoch": 0.73, "grad_norm": 3.421875, "learning_rate": 4.151265499252841e-07, "logits/chosen": 0.10607640445232391, "logits/rejected": 0.7857314944267273, "logps/chosen": -559.5725708007812, "logps/rejected": -937.8776245117188, "loss": 0.2363, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.8877224922180176, "rewards/margins": 4.257446765899658, "rewards/margins_max": 6.9602532386779785, "rewards/margins_min": 1.554640293121338, "rewards/margins_std": 3.82234525680542, "rewards/rejected": -7.145169258117676, "step": 1770 }, { "epoch": 0.73, "grad_norm": 1.328125, "learning_rate": 4.0352505762092436e-07, "logits/chosen": 0.0463348887860775, "logits/rejected": 0.6185473799705505, "logps/chosen": -528.8240356445312, "logps/rejected": -927.42822265625, "loss": 0.227, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7649338245391846, "rewards/margins": 4.2219719886779785, "rewards/margins_max": 6.481306552886963, "rewards/margins_min": 1.962636947631836, "rewards/margins_std": 3.1951823234558105, "rewards/rejected": -6.986905574798584, "step": 1780 }, { "epoch": 0.74, "grad_norm": 1.7734375, "learning_rate": 3.920468705713629e-07, "logits/chosen": 0.15475311875343323, "logits/rejected": 0.6595792174339294, "logps/chosen": -510.0287170410156, "logps/rejected": -1023.1057739257812, "loss": 0.242, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.653700113296509, "rewards/margins": 5.281457424163818, "rewards/margins_max": 8.370940208435059, "rewards/margins_min": 2.191974639892578, "rewards/margins_std": 4.3691887855529785, "rewards/rejected": -7.93515682220459, "step": 1790 }, { "epoch": 0.74, "grad_norm": 1.4296875, "learning_rate": 3.8069436158504163e-07, "logits/chosen": 0.11404214799404144, "logits/rejected": 0.6743693351745605, "logps/chosen": -570.0484619140625, "logps/rejected": -1008.6219482421875, "loss": 0.2144, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.973473072052002, "rewards/margins": 4.615906715393066, "rewards/margins_max": 6.867118835449219, "rewards/margins_min": 2.3646950721740723, "rewards/margins_std": 3.183694362640381, "rewards/rejected": -7.58937931060791, "step": 1800 }, { "epoch": 0.75, "grad_norm": 0.99609375, "learning_rate": 3.6946987748982196e-07, "logits/chosen": -0.07372093200683594, "logits/rejected": 0.47911280393600464, "logps/chosen": -601.9035034179688, "logps/rejected": -1067.716064453125, "loss": 0.248, "rewards/accuracies": 0.9375, "rewards/chosen": -3.395397186279297, "rewards/margins": 4.819011211395264, "rewards/margins_max": 7.499534606933594, "rewards/margins_min": 2.1384873390197754, "rewards/margins_std": 3.790832996368408, "rewards/rejected": -8.214407920837402, "step": 1810 }, { "epoch": 0.75, "grad_norm": 1.7109375, "learning_rate": 3.5837573864783886e-07, "logits/chosen": -0.05284532159566879, "logits/rejected": 0.6970399618148804, "logps/chosen": -556.5332641601562, "logps/rejected": -932.3153076171875, "loss": 0.2203, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.8831331729888916, "rewards/margins": 4.284741401672363, "rewards/margins_max": 6.570008277893066, "rewards/margins_min": 1.9994745254516602, "rewards/margins_std": 3.2318553924560547, "rewards/rejected": -7.167874813079834, "step": 1820 }, { "epoch": 0.75, "grad_norm": 1.234375, "learning_rate": 3.4741423847583127e-07, "logits/chosen": 0.2800007462501526, "logits/rejected": 0.9037263989448547, "logps/chosen": -551.2252807617188, "logps/rejected": -1097.2529296875, "loss": 0.2205, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.862596035003662, "rewards/margins": 5.937441825866699, "rewards/margins_max": 9.964279174804688, "rewards/margins_min": 1.9106025695800781, "rewards/margins_std": 5.694809913635254, "rewards/rejected": -8.800037384033203, "step": 1830 }, { "epoch": 0.76, "grad_norm": 1.734375, "learning_rate": 3.365876429710366e-07, "logits/chosen": 0.026043016463518143, "logits/rejected": 0.6701821088790894, "logps/chosen": -580.5740356445312, "logps/rejected": -984.2448120117188, "loss": 0.227, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0298023223876953, "rewards/margins": 4.495330333709717, "rewards/margins_max": 7.44777774810791, "rewards/margins_min": 1.5428824424743652, "rewards/margins_std": 4.175391674041748, "rewards/rejected": -7.525132179260254, "step": 1840 }, { "epoch": 0.76, "grad_norm": 1.375, "learning_rate": 3.2589819024275744e-07, "logits/chosen": 0.14268314838409424, "logits/rejected": 0.7200323939323425, "logps/chosen": -548.6128540039062, "logps/rejected": -1005.54736328125, "loss": 0.2673, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.04203462600708, "rewards/margins": 4.95261812210083, "rewards/margins_max": 7.634843349456787, "rewards/margins_min": 2.2703919410705566, "rewards/margins_std": 3.7932395935058594, "rewards/rejected": -7.99465274810791, "step": 1850 }, { "epoch": 0.77, "grad_norm": 1.3515625, "learning_rate": 3.1534809004969186e-07, "logits/chosen": 0.04652264714241028, "logits/rejected": 0.6002200841903687, "logps/chosen": -528.8763427734375, "logps/rejected": -903.3206787109375, "loss": 0.2614, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.9814038276672363, "rewards/margins": 4.039307594299316, "rewards/margins_max": 6.4738287925720215, "rewards/margins_min": 1.6047871112823486, "rewards/margins_std": 3.442932605743408, "rewards/rejected": -7.020711421966553, "step": 1860 }, { "epoch": 0.77, "grad_norm": 1.921875, "learning_rate": 3.049395233431259e-07, "logits/chosen": 0.07690130174160004, "logits/rejected": 0.6124612092971802, "logps/chosen": -497.55072021484375, "logps/rejected": -863.54052734375, "loss": 0.2795, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6492507457733154, "rewards/margins": 3.8400909900665283, "rewards/margins_max": 6.673186302185059, "rewards/margins_min": 1.0069960355758667, "rewards/margins_std": 4.006600856781006, "rewards/rejected": -6.489341735839844, "step": 1870 }, { "epoch": 0.77, "grad_norm": 1.3515625, "learning_rate": 2.946746418160787e-07, "logits/chosen": 0.10635950416326523, "logits/rejected": 0.8063497543334961, "logps/chosen": -539.3719482421875, "logps/rejected": -899.5970458984375, "loss": 0.222, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.922797441482544, "rewards/margins": 3.8881256580352783, "rewards/margins_max": 6.072175025939941, "rewards/margins_min": 1.7040764093399048, "rewards/margins_std": 3.088712215423584, "rewards/rejected": -6.810922145843506, "step": 1880 }, { "epoch": 0.78, "grad_norm": 0.66015625, "learning_rate": 2.8455556745849905e-07, "logits/chosen": 0.22083833813667297, "logits/rejected": 0.6882720589637756, "logps/chosen": -562.1170043945312, "logps/rejected": -982.6512451171875, "loss": 0.2326, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0150840282440186, "rewards/margins": 4.539645671844482, "rewards/margins_max": 7.401745796203613, "rewards/margins_min": 1.6775459051132202, "rewards/margins_std": 4.04762077331543, "rewards/rejected": -7.554730415344238, "step": 1890 }, { "epoch": 0.78, "grad_norm": 1.1953125, "learning_rate": 2.745843921185991e-07, "logits/chosen": 0.10289929062128067, "logits/rejected": 0.6530742049217224, "logps/chosen": -491.39947509765625, "logps/rejected": -889.4318237304688, "loss": 0.1988, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.5300493240356445, "rewards/margins": 4.14646053314209, "rewards/margins_max": 6.6356048583984375, "rewards/margins_min": 1.6573164463043213, "rewards/margins_std": 3.520181179046631, "rewards/rejected": -6.676509857177734, "step": 1900 }, { "epoch": 0.79, "grad_norm": 6.75, "learning_rate": 2.647631770704217e-07, "logits/chosen": 0.0862136036157608, "logits/rejected": 0.6793027520179749, "logps/chosen": -587.4644775390625, "logps/rejected": -1091.2603759765625, "loss": 0.3395, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.415440320968628, "rewards/margins": 5.334762096405029, "rewards/margins_max": 8.447580337524414, "rewards/margins_min": 2.2219443321228027, "rewards/margins_std": 4.4021897315979, "rewards/rejected": -8.750203132629395, "step": 1910 }, { "epoch": 0.79, "grad_norm": 3.96875, "learning_rate": 2.550939525877269e-07, "logits/chosen": 0.1680660843849182, "logits/rejected": 0.8692294955253601, "logps/chosen": -567.0643310546875, "logps/rejected": -903.1422729492188, "loss": 0.2927, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.0026445388793945, "rewards/margins": 3.917665481567383, "rewards/margins_max": 6.735628604888916, "rewards/margins_min": 1.0997036695480347, "rewards/margins_std": 3.9852001667022705, "rewards/rejected": -6.920310020446777, "step": 1920 }, { "epoch": 0.79, "grad_norm": 1.15625, "learning_rate": 2.455787175242867e-07, "logits/chosen": -0.026486584916710854, "logits/rejected": 0.6239403486251831, "logps/chosen": -556.9910888671875, "logps/rejected": -1082.27490234375, "loss": 0.2382, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.8589916229248047, "rewards/margins": 5.537055015563965, "rewards/margins_max": 8.529805183410645, "rewards/margins_min": 2.5443062782287598, "rewards/margins_std": 4.232387065887451, "rewards/rejected": -8.396047592163086, "step": 1930 }, { "epoch": 0.8, "grad_norm": 2.203125, "learning_rate": 2.3621943890067608e-07, "logits/chosen": 0.13468703627586365, "logits/rejected": 0.7454935908317566, "logps/chosen": -593.1705322265625, "logps/rejected": -986.3148193359375, "loss": 0.3127, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.286363124847412, "rewards/margins": 4.50880765914917, "rewards/margins_max": 7.584027290344238, "rewards/margins_min": 1.4335881471633911, "rewards/margins_std": 4.349017143249512, "rewards/rejected": -7.795170783996582, "step": 1940 }, { "epoch": 0.8, "grad_norm": 1.859375, "learning_rate": 2.2701805149764287e-07, "logits/chosen": 0.30822715163230896, "logits/rejected": 0.8703896403312683, "logps/chosen": -640.6173095703125, "logps/rejected": -919.14990234375, "loss": 0.3887, "rewards/accuracies": 0.875, "rewards/chosen": -3.50831937789917, "rewards/margins": 3.5380778312683105, "rewards/margins_max": 5.758213996887207, "rewards/margins_min": 1.3179413080215454, "rewards/margins_std": 3.139747142791748, "rewards/rejected": -7.0463972091674805, "step": 1950 }, { "epoch": 0.81, "grad_norm": 4.65625, "learning_rate": 2.1797645745614522e-07, "logits/chosen": 0.2973627746105194, "logits/rejected": 0.7991029024124146, "logps/chosen": -523.9281616210938, "logps/rejected": -1035.779541015625, "loss": 0.237, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9458510875701904, "rewards/margins": 5.273934841156006, "rewards/margins_max": 8.809687614440918, "rewards/margins_min": 1.738181710243225, "rewards/margins_std": 5.000309944152832, "rewards/rejected": -8.219786643981934, "step": 1960 }, { "epoch": 0.81, "grad_norm": 1.953125, "learning_rate": 2.090965258841334e-07, "logits/chosen": 0.11088068783283234, "logits/rejected": 0.6458398103713989, "logps/chosen": -493.0948181152344, "logps/rejected": -834.015625, "loss": 0.3476, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.821851968765259, "rewards/margins": 3.7064621448516846, "rewards/margins_max": 6.326642036437988, "rewards/margins_min": 1.08628249168396, "rewards/margins_std": 3.705493450164795, "rewards/rejected": -6.528314113616943, "step": 1970 }, { "epoch": 0.82, "grad_norm": 1.4296875, "learning_rate": 2.0038009247016317e-07, "logits/chosen": 0.10617595911026001, "logits/rejected": 0.6900930404663086, "logps/chosen": -544.4014892578125, "logps/rejected": -984.28369140625, "loss": 0.3149, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.090888738632202, "rewards/margins": 4.560122489929199, "rewards/margins_max": 7.143895149230957, "rewards/margins_min": 1.9763494729995728, "rewards/margins_std": 3.6540064811706543, "rewards/rejected": -7.651010990142822, "step": 1980 }, { "epoch": 0.82, "grad_norm": 1.046875, "learning_rate": 1.918289591039137e-07, "logits/chosen": 0.14296357333660126, "logits/rejected": 0.6799240708351135, "logps/chosen": -543.3214111328125, "logps/rejected": -1012.4182739257812, "loss": 0.2345, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.016738176345825, "rewards/margins": 5.098849773406982, "rewards/margins_max": 8.255114555358887, "rewards/margins_min": 1.9425843954086304, "rewards/margins_std": 4.463633060455322, "rewards/rejected": -8.11558723449707, "step": 1990 }, { "epoch": 0.82, "grad_norm": 0.9453125, "learning_rate": 1.8344489350369775e-07, "logits/chosen": 0.078754723072052, "logits/rejected": 0.7793623805046082, "logps/chosen": -538.4935913085938, "logps/rejected": -1064.3187255859375, "loss": 0.1977, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8698158264160156, "rewards/margins": 5.53084659576416, "rewards/margins_max": 9.207681655883789, "rewards/margins_min": 1.854010820388794, "rewards/margins_std": 5.199830532073975, "rewards/rejected": -8.400662422180176, "step": 2000 }, { "epoch": 0.83, "grad_norm": 1.1171875, "learning_rate": 1.7522962885103143e-07, "logits/chosen": 0.22342924773693085, "logits/rejected": 0.8045142889022827, "logps/chosen": -557.4118041992188, "logps/rejected": -1015.7853393554688, "loss": 0.216, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.8877434730529785, "rewards/margins": 5.013927459716797, "rewards/margins_max": 8.602627754211426, "rewards/margins_min": 1.425227403640747, "rewards/margins_std": 5.075188159942627, "rewards/rejected": -7.901670932769775, "step": 2010 }, { "epoch": 0.83, "grad_norm": 0.57421875, "learning_rate": 1.6718486343234627e-07, "logits/chosen": 0.22265294194221497, "logits/rejected": 0.8611626625061035, "logps/chosen": -595.16748046875, "logps/rejected": -976.0812377929688, "loss": 0.2934, "rewards/accuracies": 0.9375, "rewards/chosen": -3.374727249145508, "rewards/margins": 4.3763909339904785, "rewards/margins_max": 7.205048561096191, "rewards/margins_min": 1.547734022140503, "rewards/margins_std": 4.0003252029418945, "rewards/rejected": -7.7511186599731445, "step": 2020 }, { "epoch": 0.84, "grad_norm": 3.109375, "learning_rate": 1.5931226028791323e-07, "logits/chosen": 0.17107948660850525, "logits/rejected": 0.7077086567878723, "logps/chosen": -550.4708251953125, "logps/rejected": -1006.26318359375, "loss": 0.1998, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0446720123291016, "rewards/margins": 4.2325286865234375, "rewards/margins_max": 6.568942070007324, "rewards/margins_min": 1.8961141109466553, "rewards/margins_std": 3.3041882514953613, "rewards/rejected": -7.277200222015381, "step": 2030 }, { "epoch": 0.84, "grad_norm": 4.84375, "learning_rate": 1.516134468680532e-07, "logits/chosen": 0.18136277794837952, "logits/rejected": 0.7017911076545715, "logps/chosen": -527.5110473632812, "logps/rejected": -993.22900390625, "loss": 0.2196, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.743804693222046, "rewards/margins": 4.674644470214844, "rewards/margins_max": 7.117551326751709, "rewards/margins_min": 2.231738567352295, "rewards/margins_std": 3.4547908306121826, "rewards/rejected": -7.418449401855469, "step": 2040 }, { "epoch": 0.84, "grad_norm": 0.84375, "learning_rate": 1.4409001469670613e-07, "logits/chosen": 0.09716422855854034, "logits/rejected": 0.7657040357589722, "logps/chosen": -583.7706298828125, "logps/rejected": -961.2703247070312, "loss": 0.2395, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9377636909484863, "rewards/margins": 4.50144100189209, "rewards/margins_max": 7.113333225250244, "rewards/margins_min": 1.889548897743225, "rewards/margins_std": 3.6937732696533203, "rewards/rejected": -7.439205169677734, "step": 2050 }, { "epoch": 0.85, "grad_norm": 1.453125, "learning_rate": 1.3674351904242608e-07, "logits/chosen": 0.11017270386219025, "logits/rejected": 0.6662808656692505, "logps/chosen": -541.803955078125, "logps/rejected": -1012.2199096679688, "loss": 0.2271, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.0164852142333984, "rewards/margins": 4.881779670715332, "rewards/margins_max": 7.408524513244629, "rewards/margins_min": 2.3550355434417725, "rewards/margins_std": 3.5733566284179688, "rewards/rejected": -7.898265838623047, "step": 2060 }, { "epoch": 0.85, "grad_norm": 0.9140625, "learning_rate": 1.295754785968698e-07, "logits/chosen": 0.12141978740692139, "logits/rejected": 0.7523366808891296, "logps/chosen": -567.1585693359375, "logps/rejected": -1006.70166015625, "loss": 0.2292, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.1486592292785645, "rewards/margins": 4.691858768463135, "rewards/margins_max": 7.296378135681152, "rewards/margins_min": 2.087339401245117, "rewards/margins_std": 3.6833465099334717, "rewards/rejected": -7.840517997741699, "step": 2070 }, { "epoch": 0.86, "grad_norm": 1.6640625, "learning_rate": 1.2258737516084827e-07, "logits/chosen": 0.009871700778603554, "logits/rejected": 0.49689459800720215, "logps/chosen": -511.316162109375, "logps/rejected": -989.1583862304688, "loss": 0.208, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.903729200363159, "rewards/margins": 4.876491546630859, "rewards/margins_max": 7.534029960632324, "rewards/margins_min": 2.218952178955078, "rewards/margins_std": 3.7583279609680176, "rewards/rejected": -7.780220031738281, "step": 2080 }, { "epoch": 0.86, "grad_norm": 1.203125, "learning_rate": 1.1578065333800457e-07, "logits/chosen": 0.0854678601026535, "logits/rejected": 0.6038640737533569, "logps/chosen": -579.2017822265625, "logps/rejected": -1042.4700927734375, "loss": 0.3034, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.3256430625915527, "rewards/margins": 4.763891696929932, "rewards/margins_max": 7.874256134033203, "rewards/margins_min": 1.6535274982452393, "rewards/margins_std": 4.398719787597656, "rewards/rejected": -8.089534759521484, "step": 2090 }, { "epoch": 0.86, "grad_norm": 2.046875, "learning_rate": 1.091567202361805e-07, "logits/chosen": -0.015169775113463402, "logits/rejected": 0.5395482778549194, "logps/chosen": -629.2236938476562, "logps/rejected": -1089.643798828125, "loss": 0.4088, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.6674251556396484, "rewards/margins": 4.9454522132873535, "rewards/margins_max": 8.529561042785645, "rewards/margins_min": 1.3613433837890625, "rewards/margins_std": 5.068695068359375, "rewards/rejected": -8.612876892089844, "step": 2100 }, { "epoch": 0.87, "grad_norm": 0.8046875, "learning_rate": 1.0271694517653395e-07, "logits/chosen": 0.17594434320926666, "logits/rejected": 0.7346351742744446, "logps/chosen": -545.63818359375, "logps/rejected": -901.31640625, "loss": 0.3593, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.8124184608459473, "rewards/margins": 4.03906774520874, "rewards/margins_max": 6.741438388824463, "rewards/margins_min": 1.3366973400115967, "rewards/margins_std": 3.8217289447784424, "rewards/rejected": -6.8514862060546875, "step": 2110 }, { "epoch": 0.87, "grad_norm": 0.921875, "learning_rate": 9.646265941046916e-08, "logits/chosen": 0.10627947002649307, "logits/rejected": 0.7213876843452454, "logps/chosen": -531.1597900390625, "logps/rejected": -987.6064453125, "loss": 0.2611, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.926069974899292, "rewards/margins": 4.628062725067139, "rewards/margins_max": 7.4127936363220215, "rewards/margins_min": 1.8433315753936768, "rewards/margins_std": 3.938204526901245, "rewards/rejected": -7.554131984710693, "step": 2120 }, { "epoch": 0.88, "grad_norm": 1.015625, "learning_rate": 9.039515584443558e-08, "logits/chosen": 0.10931396484375, "logits/rejected": 0.675295889377594, "logps/chosen": -513.3177490234375, "logps/rejected": -960.24072265625, "loss": 0.2292, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.982128143310547, "rewards/margins": 4.812419891357422, "rewards/margins_max": 8.374956130981445, "rewards/margins_min": 1.2498825788497925, "rewards/margins_std": 5.0381879806518555, "rewards/rejected": -7.794547080993652, "step": 2130 }, { "epoch": 0.88, "grad_norm": 1.4765625, "learning_rate": 8.451568877265425e-08, "logits/chosen": 0.1267772912979126, "logits/rejected": 0.7309524416923523, "logps/chosen": -550.2372436523438, "logps/rejected": -1106.2911376953125, "loss": 0.285, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0835139751434326, "rewards/margins": 5.875071048736572, "rewards/margins_max": 9.52598762512207, "rewards/margins_min": 2.224155902862549, "rewards/margins_std": 5.163174629211426, "rewards/rejected": -8.958585739135742, "step": 2140 }, { "epoch": 0.89, "grad_norm": 1.2890625, "learning_rate": 7.882547361782587e-08, "logits/chosen": 0.07169238477945328, "logits/rejected": 0.6479736566543579, "logps/chosen": -541.5816650390625, "logps/rejected": -951.8138427734375, "loss": 0.2547, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.038245677947998, "rewards/margins": 4.422773838043213, "rewards/margins_max": 7.200972557067871, "rewards/margins_min": 1.6445751190185547, "rewards/margins_std": 3.9289660453796387, "rewards/rejected": -7.461019039154053, "step": 2150 }, { "epoch": 0.89, "grad_norm": 1.9375, "learning_rate": 7.332568667987482e-08, "logits/chosen": 0.19428284466266632, "logits/rejected": 0.808147132396698, "logps/chosen": -556.2681274414062, "logps/rejected": -916.072265625, "loss": 0.2601, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.920149326324463, "rewards/margins": 3.948967695236206, "rewards/margins_max": 6.100172996520996, "rewards/margins_min": 1.7977619171142578, "rewards/margins_std": 3.042264223098755, "rewards/rejected": -6.86911678314209, "step": 2160 }, { "epoch": 0.89, "grad_norm": 2.59375, "learning_rate": 6.801746489277993e-08, "logits/chosen": 0.07816837728023529, "logits/rejected": 0.6985357999801636, "logps/chosen": -507.4588928222656, "logps/rejected": -870.8380737304688, "loss": 0.3532, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.5561635494232178, "rewards/margins": 4.063462257385254, "rewards/margins_max": 6.586556911468506, "rewards/margins_min": 1.5403677225112915, "rewards/margins_std": 3.56819486618042, "rewards/rejected": -6.619626045227051, "step": 2170 }, { "epoch": 0.9, "grad_norm": 1.5078125, "learning_rate": 6.290190558954478e-08, "logits/chosen": -0.012972557917237282, "logits/rejected": 0.689728856086731, "logps/chosen": -533.2125244140625, "logps/rejected": -1008.7321166992188, "loss": 0.2846, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.664839029312134, "rewards/margins": 5.193449974060059, "rewards/margins_max": 7.740516662597656, "rewards/margins_min": 2.6463842391967773, "rewards/margins_std": 3.602095365524292, "rewards/rejected": -7.8582892417907715, "step": 2180 }, { "epoch": 0.9, "grad_norm": 8.125, "learning_rate": 5.798006627535279e-08, "logits/chosen": 0.0373331718146801, "logits/rejected": 0.6454305648803711, "logps/chosen": -633.1779174804688, "logps/rejected": -1055.0909423828125, "loss": 0.3153, "rewards/accuracies": 0.9375, "rewards/chosen": -3.5589823722839355, "rewards/margins": 4.771185874938965, "rewards/margins_max": 7.790403842926025, "rewards/margins_min": 1.751967430114746, "rewards/margins_std": 4.269819736480713, "rewards/rejected": -8.330168724060059, "step": 2190 }, { "epoch": 0.91, "grad_norm": 2.296875, "learning_rate": 5.325296440895621e-08, "logits/chosen": 0.20634958148002625, "logits/rejected": 0.8938538432121277, "logps/chosen": -470.0528259277344, "logps/rejected": -813.2053833007812, "loss": 0.2234, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4969820976257324, "rewards/margins": 3.7381529808044434, "rewards/margins_max": 5.691822528839111, "rewards/margins_min": 1.7844839096069336, "rewards/margins_std": 2.7629055976867676, "rewards/rejected": -6.235135555267334, "step": 2200 }, { "epoch": 0.91, "grad_norm": 0.7578125, "learning_rate": 4.872157719234438e-08, "logits/chosen": 0.12718608975410461, "logits/rejected": 0.7145224213600159, "logps/chosen": -562.0367431640625, "logps/rejected": -1052.066162109375, "loss": 0.206, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0564522743225098, "rewards/margins": 5.207182884216309, "rewards/margins_max": 8.013465881347656, "rewards/margins_min": 2.4009008407592773, "rewards/margins_std": 3.9686825275421143, "rewards/rejected": -8.263635635375977, "step": 2210 }, { "epoch": 0.91, "grad_norm": 1.53125, "learning_rate": 4.438684136873217e-08, "logits/chosen": 0.08831767737865448, "logits/rejected": 0.6083860993385315, "logps/chosen": -542.2150268554688, "logps/rejected": -1053.9603271484375, "loss": 0.1925, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0658152103424072, "rewards/margins": 5.440025329589844, "rewards/margins_max": 8.818717956542969, "rewards/margins_min": 2.0613327026367188, "rewards/margins_std": 4.77819299697876, "rewards/rejected": -8.505840301513672, "step": 2220 }, { "epoch": 0.92, "grad_norm": 2.34375, "learning_rate": 4.02496530289147e-08, "logits/chosen": 0.12742793560028076, "logits/rejected": 0.7163810729980469, "logps/chosen": -584.5067749023438, "logps/rejected": -1026.2125244140625, "loss": 0.1992, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.1832902431488037, "rewards/margins": 4.808717250823975, "rewards/margins_max": 7.214354515075684, "rewards/margins_min": 2.4030795097351074, "rewards/margins_std": 3.402085542678833, "rewards/rejected": -7.992007255554199, "step": 2230 }, { "epoch": 0.92, "grad_norm": 5.0625, "learning_rate": 3.6310867426023295e-08, "logits/chosen": 0.09458984434604645, "logits/rejected": 0.7976502180099487, "logps/chosen": -559.86962890625, "logps/rejected": -1006.2230224609375, "loss": 0.2743, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.063464403152466, "rewards/margins": 4.954630374908447, "rewards/margins_max": 7.5468573570251465, "rewards/margins_min": 2.3624024391174316, "rewards/margins_std": 3.665963649749756, "rewards/rejected": -8.018095016479492, "step": 2240 }, { "epoch": 0.93, "grad_norm": 2.609375, "learning_rate": 3.2571298798726e-08, "logits/chosen": 0.09177270531654358, "logits/rejected": 0.6422449946403503, "logps/chosen": -540.8308715820312, "logps/rejected": -921.3922119140625, "loss": 0.22, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0615222454071045, "rewards/margins": 4.212675094604492, "rewards/margins_max": 6.487915992736816, "rewards/margins_min": 1.9374347925186157, "rewards/margins_std": 3.2176766395568848, "rewards/rejected": -7.274197578430176, "step": 2250 }, { "epoch": 0.93, "grad_norm": 2.5, "learning_rate": 2.9031720202904008e-08, "logits/chosen": 0.150480717420578, "logits/rejected": 0.8441624641418457, "logps/chosen": -568.8901977539062, "logps/rejected": -986.38525390625, "loss": 0.2288, "rewards/accuracies": 0.9375, "rewards/chosen": -3.069028377532959, "rewards/margins": 4.737165927886963, "rewards/margins_max": 7.883784294128418, "rewards/margins_min": 1.5905473232269287, "rewards/margins_std": 4.449990749359131, "rewards/rejected": -7.8061933517456055, "step": 2260 }, { "epoch": 0.93, "grad_norm": 0.9453125, "learning_rate": 2.5692863351844175e-08, "logits/chosen": 0.08601401001214981, "logits/rejected": 0.7968393564224243, "logps/chosen": -508.9917907714844, "logps/rejected": -905.7818603515625, "loss": 0.2264, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.7462658882141113, "rewards/margins": 4.208594799041748, "rewards/margins_max": 6.139928340911865, "rewards/margins_min": 2.2772605419158936, "rewards/margins_std": 2.731318712234497, "rewards/rejected": -6.954860687255859, "step": 2270 }, { "epoch": 0.94, "grad_norm": 1.71875, "learning_rate": 2.2555418464976884e-08, "logits/chosen": 0.24783821403980255, "logits/rejected": 0.7340123057365417, "logps/chosen": -573.2374877929688, "logps/rejected": -1094.7659912109375, "loss": 0.2276, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.265376567840576, "rewards/margins": 5.4828691482543945, "rewards/margins_max": 8.924985885620117, "rewards/margins_min": 2.040750503540039, "rewards/margins_std": 4.867890357971191, "rewards/rejected": -8.748245239257812, "step": 2280 }, { "epoch": 0.94, "grad_norm": 0.95703125, "learning_rate": 1.9620034125190643e-08, "logits/chosen": 0.021796632558107376, "logits/rejected": 0.609171986579895, "logps/chosen": -615.38818359375, "logps/rejected": -1102.287353515625, "loss": 0.1806, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3852379322052, "rewards/margins": 5.42401123046875, "rewards/margins_max": 8.265124320983887, "rewards/margins_min": 2.5828983783721924, "rewards/margins_std": 4.017940044403076, "rewards/rejected": -8.809249877929688, "step": 2290 }, { "epoch": 0.95, "grad_norm": 2.421875, "learning_rate": 1.6887317144755776e-08, "logits/chosen": 0.16773036122322083, "logits/rejected": 0.7562099695205688, "logps/chosen": -571.23828125, "logps/rejected": -980.9654541015625, "loss": 0.2831, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.1232476234436035, "rewards/margins": 4.673791885375977, "rewards/margins_max": 7.8030548095703125, "rewards/margins_min": 1.544529914855957, "rewards/margins_std": 4.425445556640625, "rewards/rejected": -7.797039985656738, "step": 2300 }, { "epoch": 0.95, "grad_norm": 0.4609375, "learning_rate": 1.4357832439881868e-08, "logits/chosen": 0.08502840995788574, "logits/rejected": 0.6817075610160828, "logps/chosen": -530.4733276367188, "logps/rejected": -875.8533325195312, "loss": 0.285, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.0532920360565186, "rewards/margins": 3.653752088546753, "rewards/margins_max": 6.376016616821289, "rewards/margins_min": 0.9314873814582825, "rewards/margins_std": 3.8498637676239014, "rewards/rejected": -6.7070441246032715, "step": 2310 }, { "epoch": 0.96, "grad_norm": 1.3984375, "learning_rate": 1.2032102913936525e-08, "logits/chosen": 0.08291347324848175, "logits/rejected": 0.7909741401672363, "logps/chosen": -521.2633666992188, "logps/rejected": -997.1781005859375, "loss": 0.2373, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.600470781326294, "rewards/margins": 5.073606967926025, "rewards/margins_max": 8.561439514160156, "rewards/margins_min": 1.5857731103897095, "rewards/margins_std": 4.932542324066162, "rewards/rejected": -7.674078464508057, "step": 2320 }, { "epoch": 0.96, "grad_norm": 1.0390625, "learning_rate": 9.910609349348953e-09, "logits/chosen": 0.1890447735786438, "logits/rejected": 0.7709522247314453, "logps/chosen": -516.685791015625, "logps/rejected": -850.3052978515625, "loss": 0.2563, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.8008828163146973, "rewards/margins": 3.603776216506958, "rewards/margins_max": 6.171905517578125, "rewards/margins_min": 1.0356473922729492, "rewards/margins_std": 3.631882429122925, "rewards/rejected": -6.404659271240234, "step": 2330 }, { "epoch": 0.96, "grad_norm": 1.3515625, "learning_rate": 7.993790308221227e-09, "logits/chosen": 0.09116091579198837, "logits/rejected": 0.6433526873588562, "logps/chosen": -550.9741821289062, "logps/rejected": -902.2535400390625, "loss": 0.264, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5910322666168213, "rewards/margins": 3.7961928844451904, "rewards/margins_max": 5.869515419006348, "rewards/margins_min": 1.7228702306747437, "rewards/margins_std": 2.9321210384368896, "rewards/rejected": -6.387225151062012, "step": 2340 }, { "epoch": 0.97, "grad_norm": 1.21875, "learning_rate": 6.282042041667046e-09, "logits/chosen": 0.027905773371458054, "logits/rejected": 0.596718966960907, "logps/chosen": -605.1668701171875, "logps/rejected": -1050.55078125, "loss": 0.2155, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.3858304023742676, "rewards/margins": 4.683663368225098, "rewards/margins_max": 7.590599060058594, "rewards/margins_min": 1.7767282724380493, "rewards/margins_std": 4.111027717590332, "rewards/rejected": -8.069494247436523, "step": 2350 }, { "epoch": 0.97, "grad_norm": 1.0078125, "learning_rate": 4.775718407897811e-09, "logits/chosen": 0.1464410424232483, "logits/rejected": 0.5830925703048706, "logps/chosen": -496.97662353515625, "logps/rejected": -940.4425048828125, "loss": 0.1986, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.711729049682617, "rewards/margins": 4.474714756011963, "rewards/margins_max": 7.022757053375244, "rewards/margins_min": 1.9266719818115234, "rewards/margins_std": 3.6034762859344482, "rewards/rejected": -7.186443328857422, "step": 2360 }, { "epoch": 0.98, "grad_norm": 2.234375, "learning_rate": 3.4751307990712466e-09, "logits/chosen": 0.14855477213859558, "logits/rejected": 0.7474950551986694, "logps/chosen": -538.835693359375, "logps/rejected": -997.9420776367188, "loss": 0.2956, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9597933292388916, "rewards/margins": 4.721034049987793, "rewards/margins_max": 7.788567543029785, "rewards/margins_min": 1.6535009145736694, "rewards/margins_std": 4.338146686553955, "rewards/rejected": -7.680828094482422, "step": 2370 }, { "epoch": 0.98, "grad_norm": 0.8203125, "learning_rate": 2.38054807692023e-09, "logits/chosen": 0.09776248037815094, "logits/rejected": 0.6629201173782349, "logps/chosen": -488.8106994628906, "logps/rejected": -883.71826171875, "loss": 0.2274, "rewards/accuracies": 0.9375, "rewards/chosen": -2.568514347076416, "rewards/margins": 4.160928249359131, "rewards/margins_max": 6.740607261657715, "rewards/margins_min": 1.5812499523162842, "rewards/margins_std": 3.6482162475585938, "rewards/rejected": -6.729442596435547, "step": 2380 }, { "epoch": 0.98, "grad_norm": 1.890625, "learning_rate": 1.4921965171720286e-09, "logits/chosen": 0.15427419543266296, "logits/rejected": 0.7093546986579895, "logps/chosen": -487.88214111328125, "logps/rejected": -885.75390625, "loss": 0.2778, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.636676788330078, "rewards/margins": 4.090733528137207, "rewards/margins_max": 7.045225620269775, "rewards/margins_min": 1.1362407207489014, "rewards/margins_std": 4.178283214569092, "rewards/rejected": -6.727410316467285, "step": 2390 }, { "epoch": 0.99, "grad_norm": 2.171875, "learning_rate": 8.102597627722696e-10, "logits/chosen": 0.16055794060230255, "logits/rejected": 0.7714218497276306, "logps/chosen": -504.7491149902344, "logps/rejected": -1007.6917724609375, "loss": 0.2977, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.8060338497161865, "rewards/margins": 5.096834182739258, "rewards/margins_max": 7.9286065101623535, "rewards/margins_min": 2.265061140060425, "rewards/margins_std": 4.004731178283691, "rewards/rejected": -7.902867794036865, "step": 2400 }, { "epoch": 0.99, "grad_norm": 0.9296875, "learning_rate": 3.34878785921755e-10, "logits/chosen": 0.27159827947616577, "logits/rejected": 0.8951881527900696, "logps/chosen": -534.0177001953125, "logps/rejected": -945.98876953125, "loss": 0.2609, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.8999059200286865, "rewards/margins": 4.4170026779174805, "rewards/margins_max": 6.8793535232543945, "rewards/margins_min": 1.9546514749526978, "rewards/margins_std": 3.482290267944336, "rewards/rejected": -7.316908359527588, "step": 2410 }, { "epoch": 1.0, "grad_norm": 0.953125, "learning_rate": 6.615185893366072e-11, "logits/chosen": 0.05070207267999649, "logits/rejected": 0.7962743043899536, "logps/chosen": -517.2967529296875, "logps/rejected": -865.5984497070312, "loss": 0.2338, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6130902767181396, "rewards/margins": 4.025864124298096, "rewards/margins_max": 6.2240309715271, "rewards/margins_min": 1.8276973962783813, "rewards/margins_std": 3.1086769104003906, "rewards/rejected": -6.638954162597656, "step": 2420 }, { "epoch": 1.0, "eval_logits/chosen": 0.7758960127830505, "eval_logits/rejected": 0.9632418155670166, "eval_logps/chosen": -548.3036499023438, "eval_logps/rejected": -562.2833862304688, "eval_loss": 0.7594465613365173, "eval_rewards/accuracies": 0.5525000095367432, "eval_rewards/chosen": -2.099947452545166, "eval_rewards/margins": 0.3320136070251465, "eval_rewards/margins_max": 2.8946704864501953, "eval_rewards/margins_min": -1.5015698671340942, "eval_rewards/margins_std": 1.4200025796890259, "eval_rewards/rejected": -2.4319608211517334, "eval_runtime": 1667.9803, "eval_samples_per_second": 4.796, "eval_steps_per_second": 0.3, "step": 2428 }, { "epoch": 1.0, "step": 2428, "total_flos": 0.0, "train_loss": 0.36623305416185736, "train_runtime": 22442.9766, "train_samples_per_second": 1.731, "train_steps_per_second": 0.108 } ], "logging_steps": 10, "max_steps": 2428, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }