{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004, "grad_norm": 18.786390881708563, "learning_rate": 6.666666666666668e-08, "logits/chosen": -0.5580782294273376, "logits/rejected": -0.7519971132278442, "logps/chosen": -1.739689588546753, "logps/rejected": -2.5574848651885986, "loss": 1.6179, "odds_ratio_loss": 0.6882386207580566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08698447048664093, "rewards/margins": 0.040889762341976166, "rewards/rejected": -0.1278742551803589, "sft_loss": 1.739689588546753, "step": 5 }, { "epoch": 0.008, "grad_norm": 23.4203092044018, "learning_rate": 1.3333333333333336e-07, "logits/chosen": -0.33697596192359924, "logits/rejected": -0.5306824445724487, "logps/chosen": -1.5324140787124634, "logps/rejected": -1.5423911809921265, "loss": 1.6397, "odds_ratio_loss": 0.6955539584159851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07662070542573929, "rewards/margins": 0.0004988554865121841, "rewards/rejected": -0.07711955904960632, "sft_loss": 1.5324140787124634, "step": 10 }, { "epoch": 0.012, "grad_norm": 115.39311996590018, "learning_rate": 2.0000000000000002e-07, "logits/chosen": -0.46800583600997925, "logits/rejected": -0.6147341728210449, "logps/chosen": -1.223116397857666, "logps/rejected": -1.5311689376831055, "loss": 1.8705, "odds_ratio_loss": 0.5750418901443481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06115582585334778, "rewards/margins": 0.015402625314891338, "rewards/rejected": -0.07655844837427139, "sft_loss": 1.223116397857666, "step": 15 }, { "epoch": 0.016, "grad_norm": 60.195715309742106, "learning_rate": 2.666666666666667e-07, "logits/chosen": -0.699694812297821, "logits/rejected": -0.5708433985710144, "logps/chosen": -2.690779209136963, "logps/rejected": -2.1934876441955566, "loss": 1.7322, "odds_ratio_loss": 1.3645074367523193, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1345389485359192, "rewards/margins": -0.02486458420753479, "rewards/rejected": -0.1096743792295456, "sft_loss": 2.690779209136963, "step": 20 }, { "epoch": 0.02, "grad_norm": 136.78792197913728, "learning_rate": 3.3333333333333335e-07, "logits/chosen": -0.4181106686592102, "logits/rejected": -0.6348497867584229, "logps/chosen": -1.2124601602554321, "logps/rejected": -1.4138513803482056, "loss": 1.8234, "odds_ratio_loss": 0.6432029008865356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.060623008757829666, "rewards/margins": 0.010069566778838634, "rewards/rejected": -0.07069256901741028, "sft_loss": 1.2124601602554321, "step": 25 }, { "epoch": 0.024, "grad_norm": 82.77355067386294, "learning_rate": 4.0000000000000003e-07, "logits/chosen": -0.43863534927368164, "logits/rejected": -0.43456798791885376, "logps/chosen": -1.6070562601089478, "logps/rejected": -1.8459875583648682, "loss": 1.6574, "odds_ratio_loss": 0.6512861251831055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08035281300544739, "rewards/margins": 0.011946573853492737, "rewards/rejected": -0.09229937940835953, "sft_loss": 1.6070562601089478, "step": 30 }, { "epoch": 0.028, "grad_norm": 23.36137001588406, "learning_rate": 4.666666666666667e-07, "logits/chosen": -0.6327546238899231, "logits/rejected": -0.5695880055427551, "logps/chosen": -1.2451648712158203, "logps/rejected": -1.3754527568817139, "loss": 1.5305, "odds_ratio_loss": 0.7118954658508301, "rewards/accuracies": 0.5, "rewards/chosen": -0.062258243560791016, "rewards/margins": 0.006514391396194696, "rewards/rejected": -0.06877263635396957, "sft_loss": 1.2451648712158203, "step": 35 }, { "epoch": 0.032, "grad_norm": 36.18385215531129, "learning_rate": 5.333333333333335e-07, "logits/chosen": -0.5442869067192078, "logits/rejected": -0.5034470558166504, "logps/chosen": -1.5560173988342285, "logps/rejected": -1.7619720697402954, "loss": 1.4824, "odds_ratio_loss": 0.7844841480255127, "rewards/accuracies": 0.5, "rewards/chosen": -0.07780086249113083, "rewards/margins": 0.010297740809619427, "rewards/rejected": -0.08809860795736313, "sft_loss": 1.5560173988342285, "step": 40 }, { "epoch": 0.036, "grad_norm": 13.148829953179805, "learning_rate": 6.000000000000001e-07, "logits/chosen": -0.6464945673942566, "logits/rejected": -0.6559021472930908, "logps/chosen": -1.3128869533538818, "logps/rejected": -1.5577877759933472, "loss": 1.4002, "odds_ratio_loss": 0.7039626836776733, "rewards/accuracies": 0.5, "rewards/chosen": -0.06564434617757797, "rewards/margins": 0.012245049700140953, "rewards/rejected": -0.07788939774036407, "sft_loss": 1.3128869533538818, "step": 45 }, { "epoch": 0.04, "grad_norm": 31.644096608034133, "learning_rate": 6.666666666666667e-07, "logits/chosen": -0.6939803957939148, "logits/rejected": -0.9392184019088745, "logps/chosen": -1.2749038934707642, "logps/rejected": -1.8819398880004883, "loss": 1.4027, "odds_ratio_loss": 0.5089942812919617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06374519318342209, "rewards/margins": 0.030351802706718445, "rewards/rejected": -0.09409699589014053, "sft_loss": 1.2749038934707642, "step": 50 }, { "epoch": 0.044, "grad_norm": 62.42905881663458, "learning_rate": 7.333333333333334e-07, "logits/chosen": -0.8882333636283875, "logits/rejected": -0.8815867304801941, "logps/chosen": -1.6295671463012695, "logps/rejected": -1.6508338451385498, "loss": 1.53, "odds_ratio_loss": 0.8277268409729004, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.08147837221622467, "rewards/margins": 0.0010633214842528105, "rewards/rejected": -0.08254168927669525, "sft_loss": 1.6295671463012695, "step": 55 }, { "epoch": 0.048, "grad_norm": 16.729835988532628, "learning_rate": 8.000000000000001e-07, "logits/chosen": -1.0203667879104614, "logits/rejected": -0.8139181137084961, "logps/chosen": -1.0937988758087158, "logps/rejected": -1.4742703437805176, "loss": 1.3953, "odds_ratio_loss": 0.49027538299560547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05468994379043579, "rewards/margins": 0.019023580476641655, "rewards/rejected": -0.073713518679142, "sft_loss": 1.0937988758087158, "step": 60 }, { "epoch": 0.052, "grad_norm": 13.431687226067643, "learning_rate": 8.666666666666668e-07, "logits/chosen": -0.6755761504173279, "logits/rejected": -0.9210033416748047, "logps/chosen": -1.2131409645080566, "logps/rejected": -1.709071159362793, "loss": 1.3022, "odds_ratio_loss": 0.5299323201179504, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.060657043009996414, "rewards/margins": 0.024796508252620697, "rewards/rejected": -0.08545355498790741, "sft_loss": 1.2131409645080566, "step": 65 }, { "epoch": 0.056, "grad_norm": 14.183853012054012, "learning_rate": 9.333333333333334e-07, "logits/chosen": -0.6897495985031128, "logits/rejected": -0.6041379570960999, "logps/chosen": -1.1399095058441162, "logps/rejected": -1.4036375284194946, "loss": 1.2925, "odds_ratio_loss": 0.5934557914733887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05699547380208969, "rewards/margins": 0.01318640448153019, "rewards/rejected": -0.07018186897039413, "sft_loss": 1.1399095058441162, "step": 70 }, { "epoch": 0.06, "grad_norm": 8.539096149562175, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -0.8109081387519836, "logits/rejected": -0.8047749400138855, "logps/chosen": -1.396490216255188, "logps/rejected": -1.7933557033538818, "loss": 1.327, "odds_ratio_loss": 0.5322818756103516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06982450932264328, "rewards/margins": 0.019843269139528275, "rewards/rejected": -0.08966778218746185, "sft_loss": 1.396490216255188, "step": 75 }, { "epoch": 0.064, "grad_norm": 16.572964152820063, "learning_rate": 1.066666666666667e-06, "logits/chosen": -0.6374383568763733, "logits/rejected": -0.795744001865387, "logps/chosen": -1.2772842645645142, "logps/rejected": -1.3515651226043701, "loss": 1.3445, "odds_ratio_loss": 0.7885932922363281, "rewards/accuracies": 0.5, "rewards/chosen": -0.06386421620845795, "rewards/margins": 0.0037140310741961002, "rewards/rejected": -0.06757824867963791, "sft_loss": 1.2772842645645142, "step": 80 }, { "epoch": 0.068, "grad_norm": 11.477331395703159, "learning_rate": 1.1333333333333334e-06, "logits/chosen": -0.655463695526123, "logits/rejected": -0.6941269636154175, "logps/chosen": -1.2637232542037964, "logps/rejected": -1.4608389139175415, "loss": 1.3153, "odds_ratio_loss": 0.628704845905304, "rewards/accuracies": 0.5, "rewards/chosen": -0.0631861612200737, "rewards/margins": 0.009855778887867928, "rewards/rejected": -0.07304193824529648, "sft_loss": 1.2637232542037964, "step": 85 }, { "epoch": 0.072, "grad_norm": 8.882050736167244, "learning_rate": 1.2000000000000002e-06, "logits/chosen": -0.7108498215675354, "logits/rejected": -0.7867471575737, "logps/chosen": -1.346906065940857, "logps/rejected": -1.1983606815338135, "loss": 1.233, "odds_ratio_loss": 1.0082073211669922, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.06734530627727509, "rewards/margins": -0.007427269127219915, "rewards/rejected": -0.05991803854703903, "sft_loss": 1.346906065940857, "step": 90 }, { "epoch": 0.076, "grad_norm": 9.355470695768565, "learning_rate": 1.2666666666666669e-06, "logits/chosen": -0.5399163961410522, "logits/rejected": -0.7329431772232056, "logps/chosen": -1.2461333274841309, "logps/rejected": -1.0014374256134033, "loss": 1.2829, "odds_ratio_loss": 0.9972267150878906, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.06230667233467102, "rewards/margins": -0.012234793975949287, "rewards/rejected": -0.050071872770786285, "sft_loss": 1.2461333274841309, "step": 95 }, { "epoch": 0.08, "grad_norm": 21.047192553416995, "learning_rate": 1.3333333333333334e-06, "logits/chosen": -0.4293244779109955, "logits/rejected": -1.0099979639053345, "logps/chosen": -1.2779626846313477, "logps/rejected": -1.6887423992156982, "loss": 1.2277, "odds_ratio_loss": 0.48076191544532776, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.06389813870191574, "rewards/margins": 0.020538978278636932, "rewards/rejected": -0.08443711698055267, "sft_loss": 1.2779626846313477, "step": 100 }, { "epoch": 0.084, "grad_norm": 7.732739294319649, "learning_rate": 1.4000000000000001e-06, "logits/chosen": -0.48744726181030273, "logits/rejected": -0.6798993349075317, "logps/chosen": -1.0932471752166748, "logps/rejected": -1.5973366498947144, "loss": 1.1933, "odds_ratio_loss": 0.6056200861930847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05466235801577568, "rewards/margins": 0.025204479694366455, "rewards/rejected": -0.07986684143543243, "sft_loss": 1.0932471752166748, "step": 105 }, { "epoch": 0.088, "grad_norm": 17.46330045101831, "learning_rate": 1.4666666666666669e-06, "logits/chosen": -0.7039790153503418, "logits/rejected": -0.4986042380332947, "logps/chosen": -1.3014500141143799, "logps/rejected": -1.3300515413284302, "loss": 1.2646, "odds_ratio_loss": 0.7541639804840088, "rewards/accuracies": 0.5, "rewards/chosen": -0.06507251411676407, "rewards/margins": 0.0014300707262009382, "rewards/rejected": -0.06650258600711823, "sft_loss": 1.3014500141143799, "step": 110 }, { "epoch": 0.092, "grad_norm": 9.681223889349653, "learning_rate": 1.5333333333333334e-06, "logits/chosen": -0.33655840158462524, "logits/rejected": -0.45631903409957886, "logps/chosen": -1.1225440502166748, "logps/rejected": -1.4931358098983765, "loss": 1.2771, "odds_ratio_loss": 0.5696986317634583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05612720176577568, "rewards/margins": 0.01852959208190441, "rewards/rejected": -0.07465679943561554, "sft_loss": 1.1225440502166748, "step": 115 }, { "epoch": 0.096, "grad_norm": 6.114934666493648, "learning_rate": 1.6000000000000001e-06, "logits/chosen": -0.23245234787464142, "logits/rejected": -0.36162400245666504, "logps/chosen": -0.8415991067886353, "logps/rejected": -1.6995826959609985, "loss": 1.3043, "odds_ratio_loss": 0.4082559049129486, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04207995533943176, "rewards/margins": 0.04289917275309563, "rewards/rejected": -0.08497913181781769, "sft_loss": 0.8415991067886353, "step": 120 }, { "epoch": 0.1, "grad_norm": 29.566253249742736, "learning_rate": 1.6666666666666667e-06, "logits/chosen": -0.6754384636878967, "logits/rejected": -0.5784372091293335, "logps/chosen": -1.1139187812805176, "logps/rejected": -1.0610531568527222, "loss": 1.2067, "odds_ratio_loss": 0.8141271471977234, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.05569593235850334, "rewards/margins": -0.002643275773152709, "rewards/rejected": -0.05305265635251999, "sft_loss": 1.1139187812805176, "step": 125 }, { "epoch": 0.104, "grad_norm": 18.345090279015356, "learning_rate": 1.7333333333333336e-06, "logits/chosen": -0.5198614597320557, "logits/rejected": -0.5959798693656921, "logps/chosen": -1.2163742780685425, "logps/rejected": -1.0906116962432861, "loss": 1.3261, "odds_ratio_loss": 0.8466520309448242, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.060818713158369064, "rewards/margins": -0.006288123317062855, "rewards/rejected": -0.054530590772628784, "sft_loss": 1.2163742780685425, "step": 130 }, { "epoch": 0.108, "grad_norm": 14.549007799191262, "learning_rate": 1.8000000000000001e-06, "logits/chosen": -0.43266358971595764, "logits/rejected": -0.566035270690918, "logps/chosen": -1.3842393159866333, "logps/rejected": -1.4916588068008423, "loss": 1.3058, "odds_ratio_loss": 0.6889979243278503, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06921195983886719, "rewards/margins": 0.005370972212404013, "rewards/rejected": -0.07458294183015823, "sft_loss": 1.3842393159866333, "step": 135 }, { "epoch": 0.112, "grad_norm": 23.355911973188775, "learning_rate": 1.8666666666666669e-06, "logits/chosen": -0.592883825302124, "logits/rejected": -0.4132818281650543, "logps/chosen": -1.100903868675232, "logps/rejected": -1.2986562252044678, "loss": 1.141, "odds_ratio_loss": 0.5977322459220886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.055045194923877716, "rewards/margins": 0.009887613356113434, "rewards/rejected": -0.06493280827999115, "sft_loss": 1.100903868675232, "step": 140 }, { "epoch": 0.116, "grad_norm": 9.702121972571497, "learning_rate": 1.9333333333333336e-06, "logits/chosen": -0.6822474598884583, "logits/rejected": -0.49178019165992737, "logps/chosen": -1.1203429698944092, "logps/rejected": -1.9260637760162354, "loss": 1.2472, "odds_ratio_loss": 0.4579285979270935, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05601715296506882, "rewards/margins": 0.04028604179620743, "rewards/rejected": -0.09630318731069565, "sft_loss": 1.1203429698944092, "step": 145 }, { "epoch": 0.12, "grad_norm": 16.918077923973826, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -0.5237475633621216, "logits/rejected": -0.8934429287910461, "logps/chosen": -0.8749428987503052, "logps/rejected": -1.3101131916046143, "loss": 1.1565, "odds_ratio_loss": 0.5730590224266052, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04374714195728302, "rewards/margins": 0.021758515387773514, "rewards/rejected": -0.06550566107034683, "sft_loss": 0.8749428987503052, "step": 150 }, { "epoch": 0.124, "grad_norm": 11.087436159759044, "learning_rate": 2.0666666666666666e-06, "logits/chosen": -0.5964301824569702, "logits/rejected": -0.8087562322616577, "logps/chosen": -1.5669138431549072, "logps/rejected": -1.6930387020111084, "loss": 1.3477, "odds_ratio_loss": 0.6572802066802979, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07834570109844208, "rewards/margins": 0.006306241266429424, "rewards/rejected": -0.08465193212032318, "sft_loss": 1.5669138431549072, "step": 155 }, { "epoch": 0.128, "grad_norm": 11.340859017147654, "learning_rate": 2.133333333333334e-06, "logits/chosen": -0.7819768190383911, "logits/rejected": -0.4041160047054291, "logps/chosen": -1.2440321445465088, "logps/rejected": -1.2431968450546265, "loss": 1.2147, "odds_ratio_loss": 0.7579740285873413, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0622016116976738, "rewards/margins": -4.1763483750401065e-05, "rewards/rejected": -0.06215984374284744, "sft_loss": 1.2440321445465088, "step": 160 }, { "epoch": 0.132, "grad_norm": 11.290804145869599, "learning_rate": 2.2e-06, "logits/chosen": -0.3270031809806824, "logits/rejected": -0.42402464151382446, "logps/chosen": -1.2503063678741455, "logps/rejected": -1.2082438468933105, "loss": 1.3976, "odds_ratio_loss": 0.7837561368942261, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06251531839370728, "rewards/margins": -0.00210312707349658, "rewards/rejected": -0.06041219085454941, "sft_loss": 1.2503063678741455, "step": 165 }, { "epoch": 0.136, "grad_norm": 8.653604673632358, "learning_rate": 2.266666666666667e-06, "logits/chosen": -0.4413372874259949, "logits/rejected": -0.7267617583274841, "logps/chosen": -1.1855711936950684, "logps/rejected": -1.0826096534729004, "loss": 1.2368, "odds_ratio_loss": 0.8635139465332031, "rewards/accuracies": 0.5, "rewards/chosen": -0.059278566390275955, "rewards/margins": -0.005148076917976141, "rewards/rejected": -0.05413048714399338, "sft_loss": 1.1855711936950684, "step": 170 }, { "epoch": 0.14, "grad_norm": 18.84252613733039, "learning_rate": 2.3333333333333336e-06, "logits/chosen": -0.31860145926475525, "logits/rejected": -0.42350611090660095, "logps/chosen": -1.1006211042404175, "logps/rejected": -1.26455557346344, "loss": 1.2382, "odds_ratio_loss": 0.6225001811981201, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05503106117248535, "rewards/margins": 0.00819671992212534, "rewards/rejected": -0.06322778016328812, "sft_loss": 1.1006211042404175, "step": 175 }, { "epoch": 0.144, "grad_norm": 8.60596229262369, "learning_rate": 2.4000000000000003e-06, "logits/chosen": -0.6520022749900818, "logits/rejected": -0.5037721991539001, "logps/chosen": -1.0799922943115234, "logps/rejected": -1.3683429956436157, "loss": 1.2163, "odds_ratio_loss": 0.6034747362136841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05399961397051811, "rewards/margins": 0.014417541213333607, "rewards/rejected": -0.06841715425252914, "sft_loss": 1.0799922943115234, "step": 180 }, { "epoch": 0.148, "grad_norm": 8.352997349835983, "learning_rate": 2.466666666666667e-06, "logits/chosen": -0.33638280630111694, "logits/rejected": -0.3540743589401245, "logps/chosen": -1.5841834545135498, "logps/rejected": -1.106182336807251, "loss": 1.274, "odds_ratio_loss": 1.2082093954086304, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.07920917868614197, "rewards/margins": -0.02390005625784397, "rewards/rejected": -0.05530911684036255, "sft_loss": 1.5841834545135498, "step": 185 }, { "epoch": 0.152, "grad_norm": 10.77421191597458, "learning_rate": 2.5333333333333338e-06, "logits/chosen": -0.35303565859794617, "logits/rejected": -0.5206754803657532, "logps/chosen": -1.4167323112487793, "logps/rejected": -1.4747750759124756, "loss": 1.2796, "odds_ratio_loss": 0.7247415781021118, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0708366185426712, "rewards/margins": 0.002902137814089656, "rewards/rejected": -0.07373875379562378, "sft_loss": 1.4167323112487793, "step": 190 }, { "epoch": 0.156, "grad_norm": 7.1177401499786415, "learning_rate": 2.6e-06, "logits/chosen": -0.2510073482990265, "logits/rejected": -0.5589274168014526, "logps/chosen": -1.442596197128296, "logps/rejected": -1.4756193161010742, "loss": 1.3657, "odds_ratio_loss": 0.7003253102302551, "rewards/accuracies": 0.5, "rewards/chosen": -0.07212980836629868, "rewards/margins": 0.0016511573921889067, "rewards/rejected": -0.07378096878528595, "sft_loss": 1.442596197128296, "step": 195 }, { "epoch": 0.16, "grad_norm": 16.763946753945877, "learning_rate": 2.666666666666667e-06, "logits/chosen": -0.45789265632629395, "logits/rejected": -0.4571969509124756, "logps/chosen": -1.0912725925445557, "logps/rejected": -1.1674692630767822, "loss": 1.3449, "odds_ratio_loss": 0.7581279873847961, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05456363409757614, "rewards/margins": 0.0038098313380032778, "rewards/rejected": -0.05837346240878105, "sft_loss": 1.0912725925445557, "step": 200 }, { "epoch": 0.164, "grad_norm": 7.633189094243207, "learning_rate": 2.7333333333333336e-06, "logits/chosen": -0.5795624852180481, "logits/rejected": -0.6412222981452942, "logps/chosen": -1.0281997919082642, "logps/rejected": -1.2789350748062134, "loss": 1.2025, "odds_ratio_loss": 0.6383514404296875, "rewards/accuracies": 0.5, "rewards/chosen": -0.05140998959541321, "rewards/margins": 0.012536766938865185, "rewards/rejected": -0.06394675374031067, "sft_loss": 1.0281997919082642, "step": 205 }, { "epoch": 0.168, "grad_norm": 22.399131275217997, "learning_rate": 2.8000000000000003e-06, "logits/chosen": -0.3538805842399597, "logits/rejected": -0.6279109120368958, "logps/chosen": -1.0936148166656494, "logps/rejected": -1.2315231561660767, "loss": 1.273, "odds_ratio_loss": 0.6472857594490051, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05468074232339859, "rewards/margins": 0.0068954164162278175, "rewards/rejected": -0.06157616525888443, "sft_loss": 1.0936148166656494, "step": 210 }, { "epoch": 0.172, "grad_norm": 12.511784021141086, "learning_rate": 2.866666666666667e-06, "logits/chosen": -0.6461108326911926, "logits/rejected": -0.8027406930923462, "logps/chosen": -1.279821515083313, "logps/rejected": -1.2249929904937744, "loss": 1.1567, "odds_ratio_loss": 0.7108494639396667, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06399108469486237, "rewards/margins": -0.002741418778896332, "rewards/rejected": -0.06124965474009514, "sft_loss": 1.279821515083313, "step": 215 }, { "epoch": 0.176, "grad_norm": 18.111790971616198, "learning_rate": 2.9333333333333338e-06, "logits/chosen": -0.5222848057746887, "logits/rejected": -0.5374074578285217, "logps/chosen": -1.1454452276229858, "logps/rejected": -1.541313648223877, "loss": 1.3047, "odds_ratio_loss": 0.46772265434265137, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.05727226287126541, "rewards/margins": 0.01979340985417366, "rewards/rejected": -0.07706567645072937, "sft_loss": 1.1454452276229858, "step": 220 }, { "epoch": 0.18, "grad_norm": 12.734975127781638, "learning_rate": 3e-06, "logits/chosen": -0.5977376699447632, "logits/rejected": -0.2883208096027374, "logps/chosen": -1.3275381326675415, "logps/rejected": -1.18563711643219, "loss": 1.2512, "odds_ratio_loss": 0.9130814671516418, "rewards/accuracies": 0.5, "rewards/chosen": -0.06637690961360931, "rewards/margins": -0.00709505146369338, "rewards/rejected": -0.0592818558216095, "sft_loss": 1.3275381326675415, "step": 225 }, { "epoch": 0.184, "grad_norm": 21.003312916139876, "learning_rate": 3.066666666666667e-06, "logits/chosen": -0.5674949288368225, "logits/rejected": -0.49964016675949097, "logps/chosen": -1.375934362411499, "logps/rejected": -1.5092494487762451, "loss": 1.3693, "odds_ratio_loss": 0.7147940993309021, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06879671663045883, "rewards/margins": 0.006665749941021204, "rewards/rejected": -0.0754624754190445, "sft_loss": 1.375934362411499, "step": 230 }, { "epoch": 0.188, "grad_norm": 11.904735065181235, "learning_rate": 3.133333333333334e-06, "logits/chosen": -0.6791194677352905, "logits/rejected": -0.7436596155166626, "logps/chosen": -0.8784946203231812, "logps/rejected": -1.1814043521881104, "loss": 1.248, "odds_ratio_loss": 0.5295432806015015, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.043924733996391296, "rewards/margins": 0.015145489946007729, "rewards/rejected": -0.059070222079753876, "sft_loss": 0.8784946203231812, "step": 235 }, { "epoch": 0.192, "grad_norm": 15.144263155627513, "learning_rate": 3.2000000000000003e-06, "logits/chosen": -0.39098218083381653, "logits/rejected": -0.48908573389053345, "logps/chosen": -1.4278444051742554, "logps/rejected": -1.5091187953948975, "loss": 1.3214, "odds_ratio_loss": 0.6652258634567261, "rewards/accuracies": 0.5, "rewards/chosen": -0.07139221578836441, "rewards/margins": 0.004063720349222422, "rewards/rejected": -0.0754559338092804, "sft_loss": 1.4278444051742554, "step": 240 }, { "epoch": 0.196, "grad_norm": 7.701817646797151, "learning_rate": 3.266666666666667e-06, "logits/chosen": -0.39734160900115967, "logits/rejected": -0.6708099246025085, "logps/chosen": -1.1432991027832031, "logps/rejected": -1.2521193027496338, "loss": 1.2041, "odds_ratio_loss": 0.7118695974349976, "rewards/accuracies": 0.5, "rewards/chosen": -0.057164955884218216, "rewards/margins": 0.005441013723611832, "rewards/rejected": -0.06260596960783005, "sft_loss": 1.1432991027832031, "step": 245 }, { "epoch": 0.2, "grad_norm": 9.281984255417264, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -0.3979361057281494, "logits/rejected": -0.5376076102256775, "logps/chosen": -0.9498193860054016, "logps/rejected": -1.696179986000061, "loss": 1.3049, "odds_ratio_loss": 0.5728334188461304, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04749097302556038, "rewards/margins": 0.03731803596019745, "rewards/rejected": -0.08480901271104813, "sft_loss": 0.9498193860054016, "step": 250 }, { "epoch": 0.204, "grad_norm": 9.425798997764154, "learning_rate": 3.4000000000000005e-06, "logits/chosen": -0.58461594581604, "logits/rejected": -0.7774965763092041, "logps/chosen": -1.144986867904663, "logps/rejected": -1.2843310832977295, "loss": 1.3384, "odds_ratio_loss": 0.6974171996116638, "rewards/accuracies": 0.5, "rewards/chosen": -0.057249344885349274, "rewards/margins": 0.006967212073504925, "rewards/rejected": -0.06421655416488647, "sft_loss": 1.144986867904663, "step": 255 }, { "epoch": 0.208, "grad_norm": 6.829873424161352, "learning_rate": 3.4666666666666672e-06, "logits/chosen": -0.4852909445762634, "logits/rejected": -0.8059079051017761, "logps/chosen": -1.2967959642410278, "logps/rejected": -1.1163851022720337, "loss": 1.2336, "odds_ratio_loss": 0.985200047492981, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06483979523181915, "rewards/margins": -0.009020542725920677, "rewards/rejected": -0.05581926181912422, "sft_loss": 1.2967959642410278, "step": 260 }, { "epoch": 0.212, "grad_norm": 12.53260839362661, "learning_rate": 3.5333333333333335e-06, "logits/chosen": -0.8439911603927612, "logits/rejected": -0.4962303042411804, "logps/chosen": -1.4761368036270142, "logps/rejected": -1.4275569915771484, "loss": 1.2936, "odds_ratio_loss": 0.8170219659805298, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.07380684465169907, "rewards/margins": -0.0024289849679917097, "rewards/rejected": -0.07137785851955414, "sft_loss": 1.4761368036270142, "step": 265 }, { "epoch": 0.216, "grad_norm": 10.909364520509888, "learning_rate": 3.6000000000000003e-06, "logits/chosen": -0.7634373307228088, "logits/rejected": -0.5086906552314758, "logps/chosen": -1.543953537940979, "logps/rejected": -1.3793667554855347, "loss": 1.4597, "odds_ratio_loss": 0.8944965600967407, "rewards/accuracies": 0.5, "rewards/chosen": -0.07719766348600388, "rewards/margins": -0.008229334838688374, "rewards/rejected": -0.06896833330392838, "sft_loss": 1.543953537940979, "step": 270 }, { "epoch": 0.22, "grad_norm": 14.036081295560992, "learning_rate": 3.6666666666666666e-06, "logits/chosen": -0.920630931854248, "logits/rejected": -0.5563604831695557, "logps/chosen": -1.1286569833755493, "logps/rejected": -1.3310692310333252, "loss": 1.3055, "odds_ratio_loss": 0.621965765953064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.056432850658893585, "rewards/margins": 0.010120616294443607, "rewards/rejected": -0.06655346602201462, "sft_loss": 1.1286569833755493, "step": 275 }, { "epoch": 0.224, "grad_norm": 7.314465975057194, "learning_rate": 3.7333333333333337e-06, "logits/chosen": -0.39631861448287964, "logits/rejected": -0.6269375681877136, "logps/chosen": -1.2827950716018677, "logps/rejected": -1.5547349452972412, "loss": 1.2209, "odds_ratio_loss": 0.7333937883377075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06413975358009338, "rewards/margins": 0.013596994802355766, "rewards/rejected": -0.0777367502450943, "sft_loss": 1.2827950716018677, "step": 280 }, { "epoch": 0.228, "grad_norm": 8.448913656983327, "learning_rate": 3.8000000000000005e-06, "logits/chosen": -0.465139240026474, "logits/rejected": -0.5822895169258118, "logps/chosen": -1.4464404582977295, "logps/rejected": -1.5029900074005127, "loss": 1.2867, "odds_ratio_loss": 0.6958650350570679, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07232202589511871, "rewards/margins": 0.002827476244419813, "rewards/rejected": -0.07514949887990952, "sft_loss": 1.4464404582977295, "step": 285 }, { "epoch": 0.232, "grad_norm": 5.845937122945277, "learning_rate": 3.866666666666667e-06, "logits/chosen": -0.43053460121154785, "logits/rejected": -0.5996862649917603, "logps/chosen": -1.004393458366394, "logps/rejected": -1.2151196002960205, "loss": 1.235, "odds_ratio_loss": 0.6134498715400696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05021967366337776, "rewards/margins": 0.010536303743720055, "rewards/rejected": -0.060755979269742966, "sft_loss": 1.004393458366394, "step": 290 }, { "epoch": 0.236, "grad_norm": 7.768195625026772, "learning_rate": 3.9333333333333335e-06, "logits/chosen": -0.3257526755332947, "logits/rejected": -0.6037822365760803, "logps/chosen": -2.014780282974243, "logps/rejected": -1.3312056064605713, "loss": 1.3802, "odds_ratio_loss": 1.3934136629104614, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1007390022277832, "rewards/margins": -0.034178726375103, "rewards/rejected": -0.0665602907538414, "sft_loss": 2.014780282974243, "step": 295 }, { "epoch": 0.24, "grad_norm": 7.746680686909107, "learning_rate": 4.000000000000001e-06, "logits/chosen": -0.733782947063446, "logits/rejected": -0.6740893125534058, "logps/chosen": -1.2197449207305908, "logps/rejected": -1.2922364473342896, "loss": 1.2703, "odds_ratio_loss": 0.665755569934845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06098724529147148, "rewards/margins": 0.003624574514105916, "rewards/rejected": -0.06461182236671448, "sft_loss": 1.2197449207305908, "step": 300 }, { "epoch": 0.244, "grad_norm": 13.0956632135183, "learning_rate": 4.066666666666667e-06, "logits/chosen": -0.7939602732658386, "logits/rejected": -0.3044114410877228, "logps/chosen": -1.1555862426757812, "logps/rejected": -1.36032235622406, "loss": 1.321, "odds_ratio_loss": 0.639162003993988, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05777931213378906, "rewards/margins": 0.010236804373562336, "rewards/rejected": -0.06801611930131912, "sft_loss": 1.1555862426757812, "step": 305 }, { "epoch": 0.248, "grad_norm": 11.678408623894509, "learning_rate": 4.133333333333333e-06, "logits/chosen": -0.7341225743293762, "logits/rejected": -0.7172056436538696, "logps/chosen": -1.210506796836853, "logps/rejected": -1.2130941152572632, "loss": 1.3283, "odds_ratio_loss": 0.751409113407135, "rewards/accuracies": 0.5, "rewards/chosen": -0.06052535027265549, "rewards/margins": 0.00012936182611156255, "rewards/rejected": -0.06065471097826958, "sft_loss": 1.210506796836853, "step": 310 }, { "epoch": 0.252, "grad_norm": 10.485708060104384, "learning_rate": 4.2000000000000004e-06, "logits/chosen": -0.7123367190361023, "logits/rejected": -0.5732913613319397, "logps/chosen": -1.216386079788208, "logps/rejected": -1.3553041219711304, "loss": 1.2446, "odds_ratio_loss": 0.649107038974762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06081929802894592, "rewards/margins": 0.006945909466594458, "rewards/rejected": -0.06776521354913712, "sft_loss": 1.216386079788208, "step": 315 }, { "epoch": 0.256, "grad_norm": 5.890587593911549, "learning_rate": 4.266666666666668e-06, "logits/chosen": -0.5388275384902954, "logits/rejected": -0.5508286356925964, "logps/chosen": -1.1097781658172607, "logps/rejected": -1.4719569683074951, "loss": 1.1982, "odds_ratio_loss": 0.5467421412467957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05548890680074692, "rewards/margins": 0.01810893975198269, "rewards/rejected": -0.07359784096479416, "sft_loss": 1.1097781658172607, "step": 320 }, { "epoch": 0.26, "grad_norm": 16.06285773185922, "learning_rate": 4.333333333333334e-06, "logits/chosen": -0.7908880114555359, "logits/rejected": -0.20292505621910095, "logps/chosen": -2.2549867630004883, "logps/rejected": -1.2224481105804443, "loss": 1.5111, "odds_ratio_loss": 1.8231436014175415, "rewards/accuracies": 0.5, "rewards/chosen": -0.11274933815002441, "rewards/margins": -0.05162693187594414, "rewards/rejected": -0.061122406274080276, "sft_loss": 2.2549867630004883, "step": 325 }, { "epoch": 0.264, "grad_norm": 8.454262342240037, "learning_rate": 4.4e-06, "logits/chosen": -0.4531725347042084, "logits/rejected": -0.431458055973053, "logps/chosen": -1.2020156383514404, "logps/rejected": -1.3703901767730713, "loss": 1.23, "odds_ratio_loss": 0.5940185785293579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06010078266263008, "rewards/margins": 0.008418736979365349, "rewards/rejected": -0.06851951032876968, "sft_loss": 1.2020156383514404, "step": 330 }, { "epoch": 0.268, "grad_norm": 9.90145361257035, "learning_rate": 4.4666666666666665e-06, "logits/chosen": -0.5263561010360718, "logits/rejected": -0.6940143704414368, "logps/chosen": -0.9874106645584106, "logps/rejected": -1.1406878232955933, "loss": 1.2596, "odds_ratio_loss": 0.8020124435424805, "rewards/accuracies": 0.5, "rewards/chosen": -0.04937053471803665, "rewards/margins": 0.007663858123123646, "rewards/rejected": -0.05703439190983772, "sft_loss": 0.9874106645584106, "step": 335 }, { "epoch": 0.272, "grad_norm": 7.2942480975024475, "learning_rate": 4.533333333333334e-06, "logits/chosen": -0.4240873456001282, "logits/rejected": -0.6028670072555542, "logps/chosen": -1.3075716495513916, "logps/rejected": -2.017928123474121, "loss": 1.2217, "odds_ratio_loss": 0.5069721341133118, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0653785839676857, "rewards/margins": 0.035517822951078415, "rewards/rejected": -0.10089640319347382, "sft_loss": 1.3075716495513916, "step": 340 }, { "epoch": 0.276, "grad_norm": 8.866201564442973, "learning_rate": 4.600000000000001e-06, "logits/chosen": -0.7544479370117188, "logits/rejected": -0.21181067824363708, "logps/chosen": -1.218145489692688, "logps/rejected": -1.204552412033081, "loss": 1.1941, "odds_ratio_loss": 0.8569790720939636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0609072744846344, "rewards/margins": -0.0006796553498134017, "rewards/rejected": -0.06022762134671211, "sft_loss": 1.218145489692688, "step": 345 }, { "epoch": 0.28, "grad_norm": 8.192936983091547, "learning_rate": 4.666666666666667e-06, "logits/chosen": -0.32254576683044434, "logits/rejected": -0.6721733212471008, "logps/chosen": -1.1165757179260254, "logps/rejected": -1.2479455471038818, "loss": 1.2225, "odds_ratio_loss": 0.6464930176734924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05582878738641739, "rewards/margins": 0.006568485405296087, "rewards/rejected": -0.062397271394729614, "sft_loss": 1.1165757179260254, "step": 350 }, { "epoch": 0.284, "grad_norm": 13.020714692622663, "learning_rate": 4.7333333333333335e-06, "logits/chosen": -0.4439846873283386, "logits/rejected": -0.48574358224868774, "logps/chosen": -1.4092596769332886, "logps/rejected": -1.8674914836883545, "loss": 1.1975, "odds_ratio_loss": 0.6004700064659119, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07046298682689667, "rewards/margins": 0.022911589592695236, "rewards/rejected": -0.0933745726943016, "sft_loss": 1.4092596769332886, "step": 355 }, { "epoch": 0.288, "grad_norm": 11.400235583060827, "learning_rate": 4.800000000000001e-06, "logits/chosen": -0.3304193615913391, "logits/rejected": -0.539570689201355, "logps/chosen": -1.2983877658843994, "logps/rejected": -1.2585299015045166, "loss": 1.2775, "odds_ratio_loss": 0.7338186502456665, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06491939723491669, "rewards/margins": -0.0019928955007344484, "rewards/rejected": -0.06292649358510971, "sft_loss": 1.2983877658843994, "step": 360 }, { "epoch": 0.292, "grad_norm": 8.196409587525967, "learning_rate": 4.866666666666667e-06, "logits/chosen": -0.4542488157749176, "logits/rejected": -0.49449652433395386, "logps/chosen": -1.022592544555664, "logps/rejected": -1.0563173294067383, "loss": 1.3053, "odds_ratio_loss": 0.7796744704246521, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.05112962797284126, "rewards/margins": 0.0016862439224496484, "rewards/rejected": -0.05281587317585945, "sft_loss": 1.022592544555664, "step": 365 }, { "epoch": 0.296, "grad_norm": 10.509812724188748, "learning_rate": 4.933333333333334e-06, "logits/chosen": -0.46988096833229065, "logits/rejected": -0.31580036878585815, "logps/chosen": -1.3959112167358398, "logps/rejected": -1.6540298461914062, "loss": 1.2666, "odds_ratio_loss": 0.5971593856811523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06979555636644363, "rewards/margins": 0.012905935756862164, "rewards/rejected": -0.08270148932933807, "sft_loss": 1.3959112167358398, "step": 370 }, { "epoch": 0.3, "grad_norm": 6.579280238314472, "learning_rate": 5e-06, "logits/chosen": -0.3907073736190796, "logits/rejected": -0.5983395576477051, "logps/chosen": -1.2141754627227783, "logps/rejected": -1.3564534187316895, "loss": 1.2967, "odds_ratio_loss": 0.6791958808898926, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06070876866579056, "rewards/margins": 0.007113890256732702, "rewards/rejected": -0.06782267242670059, "sft_loss": 1.2141754627227783, "step": 375 }, { "epoch": 0.304, "grad_norm": 8.913119169147508, "learning_rate": 4.999972922944898e-06, "logits/chosen": -0.7129698395729065, "logits/rejected": -0.48502880334854126, "logps/chosen": -1.3781096935272217, "logps/rejected": -1.5335767269134521, "loss": 1.3143, "odds_ratio_loss": 0.6898760795593262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06890548765659332, "rewards/margins": 0.007773351855576038, "rewards/rejected": -0.07667883485555649, "sft_loss": 1.3781096935272217, "step": 380 }, { "epoch": 0.308, "grad_norm": 8.218029726787362, "learning_rate": 4.999891692366121e-06, "logits/chosen": -0.26474082469940186, "logits/rejected": -0.49545183777809143, "logps/chosen": -0.9910273551940918, "logps/rejected": -0.944848895072937, "loss": 1.2702, "odds_ratio_loss": 0.7946783304214478, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.04955137148499489, "rewards/margins": -0.0023089235182851553, "rewards/rejected": -0.04724244400858879, "sft_loss": 0.9910273551940918, "step": 385 }, { "epoch": 0.312, "grad_norm": 7.274203044505213, "learning_rate": 4.999756310023261e-06, "logits/chosen": -0.2605069875717163, "logits/rejected": -0.868638813495636, "logps/chosen": -1.2463948726654053, "logps/rejected": -1.2507517337799072, "loss": 1.3006, "odds_ratio_loss": 0.741990327835083, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06231974810361862, "rewards/margins": 0.00021784081764053553, "rewards/rejected": -0.06253758817911148, "sft_loss": 1.2463948726654053, "step": 390 }, { "epoch": 0.316, "grad_norm": 7.536549818073618, "learning_rate": 4.99956677884892e-06, "logits/chosen": -0.9248331785202026, "logits/rejected": -0.4915364384651184, "logps/chosen": -1.2074693441390991, "logps/rejected": -1.51399827003479, "loss": 1.2689, "odds_ratio_loss": 0.590670645236969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.060373466461896896, "rewards/margins": 0.015326438471674919, "rewards/rejected": -0.07569991052150726, "sft_loss": 1.2074693441390991, "step": 395 }, { "epoch": 0.32, "grad_norm": 7.302200763333058, "learning_rate": 4.999323102948655e-06, "logits/chosen": -1.0176507234573364, "logits/rejected": -0.4075535833835602, "logps/chosen": -1.2042930126190186, "logps/rejected": -1.1218703985214233, "loss": 1.3829, "odds_ratio_loss": 0.8366869688034058, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06021466106176376, "rewards/margins": -0.004121133126318455, "rewards/rejected": -0.056093525141477585, "sft_loss": 1.2042930126190186, "step": 400 }, { "epoch": 0.324, "grad_norm": 9.822662210243099, "learning_rate": 4.999025287600886e-06, "logits/chosen": -0.6442962884902954, "logits/rejected": -1.0240824222564697, "logps/chosen": -1.1904699802398682, "logps/rejected": -1.3809891939163208, "loss": 1.192, "odds_ratio_loss": 0.5962144732475281, "rewards/accuracies": 0.5, "rewards/chosen": -0.059523504227399826, "rewards/margins": 0.009525952860713005, "rewards/rejected": -0.06904946267604828, "sft_loss": 1.1904699802398682, "step": 405 }, { "epoch": 0.328, "grad_norm": 7.990885197694433, "learning_rate": 4.998673339256785e-06, "logits/chosen": -0.47288981080055237, "logits/rejected": -1.0867726802825928, "logps/chosen": -1.395422101020813, "logps/rejected": -1.6224483251571655, "loss": 1.4465, "odds_ratio_loss": 0.728722870349884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06977111101150513, "rewards/margins": 0.011351310648024082, "rewards/rejected": -0.08112241327762604, "sft_loss": 1.395422101020813, "step": 410 }, { "epoch": 0.332, "grad_norm": 22.49371675260937, "learning_rate": 4.99826726554013e-06, "logits/chosen": -0.7810731530189514, "logits/rejected": -0.6283494234085083, "logps/chosen": -1.206227421760559, "logps/rejected": -1.0297901630401611, "loss": 1.3452, "odds_ratio_loss": 0.9216625094413757, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06031137704849243, "rewards/margins": -0.008821866475045681, "rewards/rejected": -0.051489509642124176, "sft_loss": 1.206227421760559, "step": 415 }, { "epoch": 0.336, "grad_norm": 7.038901075406193, "learning_rate": 4.997807075247147e-06, "logits/chosen": -0.4292398989200592, "logits/rejected": -0.7970374226570129, "logps/chosen": -1.2103397846221924, "logps/rejected": -1.492582082748413, "loss": 1.301, "odds_ratio_loss": 0.5660519003868103, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06051699072122574, "rewards/margins": 0.014112117700278759, "rewards/rejected": -0.07462911307811737, "sft_loss": 1.2103397846221924, "step": 420 }, { "epoch": 0.34, "grad_norm": 13.51899355096485, "learning_rate": 4.997292778346312e-06, "logits/chosen": -0.3543488681316376, "logits/rejected": -0.5476067662239075, "logps/chosen": -1.12038254737854, "logps/rejected": -1.4835479259490967, "loss": 1.2928, "odds_ratio_loss": 0.621757984161377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.056019127368927, "rewards/margins": 0.018158262595534325, "rewards/rejected": -0.07417739927768707, "sft_loss": 1.12038254737854, "step": 425 }, { "epoch": 0.344, "grad_norm": 18.36630863667148, "learning_rate": 4.996724385978142e-06, "logits/chosen": -0.6492398977279663, "logits/rejected": -0.38872796297073364, "logps/chosen": -1.6991106271743774, "logps/rejected": -1.4718869924545288, "loss": 1.3601, "odds_ratio_loss": 1.0099233388900757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08495552837848663, "rewards/margins": -0.011361182667315006, "rewards/rejected": -0.0735943466424942, "sft_loss": 1.6991106271743774, "step": 430 }, { "epoch": 0.348, "grad_norm": 13.64259911410447, "learning_rate": 4.996101910454953e-06, "logits/chosen": -0.5003396272659302, "logits/rejected": -0.9429534673690796, "logps/chosen": -1.7060340642929077, "logps/rejected": -1.4147275686264038, "loss": 1.3717, "odds_ratio_loss": 0.9737583994865417, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0853017121553421, "rewards/margins": -0.014565333724021912, "rewards/rejected": -0.07073638588190079, "sft_loss": 1.7060340642929077, "step": 435 }, { "epoch": 0.352, "grad_norm": 14.592917079690519, "learning_rate": 4.995425365260585e-06, "logits/chosen": -0.40251150727272034, "logits/rejected": -0.9722174406051636, "logps/chosen": -1.0912220478057861, "logps/rejected": -1.363419771194458, "loss": 1.2888, "odds_ratio_loss": 0.5721346735954285, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05456110090017319, "rewards/margins": 0.01360989362001419, "rewards/rejected": -0.06817099452018738, "sft_loss": 1.0912220478057861, "step": 440 }, { "epoch": 0.356, "grad_norm": 6.937137874320563, "learning_rate": 4.994694765050121e-06, "logits/chosen": -0.8574808835983276, "logits/rejected": -0.7796967625617981, "logps/chosen": -1.3558590412139893, "logps/rejected": -1.6645358800888062, "loss": 1.2991, "odds_ratio_loss": 0.5890018939971924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06779295206069946, "rewards/margins": 0.015433847904205322, "rewards/rejected": -0.08322679251432419, "sft_loss": 1.3558590412139893, "step": 445 }, { "epoch": 0.36, "grad_norm": 13.634295128279398, "learning_rate": 4.993910125649561e-06, "logits/chosen": -0.35561490058898926, "logits/rejected": -0.9158598184585571, "logps/chosen": -1.2657678127288818, "logps/rejected": -1.4462788105010986, "loss": 1.3292, "odds_ratio_loss": 0.7243040800094604, "rewards/accuracies": 0.5, "rewards/chosen": -0.06328839808702469, "rewards/margins": 0.009025548584759235, "rewards/rejected": -0.07231394946575165, "sft_loss": 1.2657678127288818, "step": 450 }, { "epoch": 0.364, "grad_norm": 15.588418308947864, "learning_rate": 4.993071464055486e-06, "logits/chosen": -0.7908333539962769, "logits/rejected": -0.9238386154174805, "logps/chosen": -1.464706301689148, "logps/rejected": -1.4646415710449219, "loss": 1.3062, "odds_ratio_loss": 0.7673660516738892, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07323531806468964, "rewards/margins": -3.232434437450138e-06, "rewards/rejected": -0.07323208451271057, "sft_loss": 1.464706301689148, "step": 455 }, { "epoch": 0.368, "grad_norm": 7.67023021612323, "learning_rate": 4.992178798434684e-06, "logits/chosen": -0.46548470854759216, "logits/rejected": -1.2474113702774048, "logps/chosen": -0.9748779535293579, "logps/rejected": -1.329563856124878, "loss": 1.2079, "odds_ratio_loss": 0.5502606630325317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.048743896186351776, "rewards/margins": 0.01773429848253727, "rewards/rejected": -0.0664781928062439, "sft_loss": 0.9748779535293579, "step": 460 }, { "epoch": 0.372, "grad_norm": 9.396021890858217, "learning_rate": 4.9912321481237616e-06, "logits/chosen": -0.3827522397041321, "logits/rejected": -0.7908186912536621, "logps/chosen": -1.2393275499343872, "logps/rejected": -1.1069507598876953, "loss": 1.1528, "odds_ratio_loss": 0.8488165140151978, "rewards/accuracies": 0.5, "rewards/chosen": -0.06196638196706772, "rewards/margins": -0.00661883782595396, "rewards/rejected": -0.055347543209791183, "sft_loss": 1.2393275499343872, "step": 465 }, { "epoch": 0.376, "grad_norm": 15.667684557323762, "learning_rate": 4.990231533628719e-06, "logits/chosen": -0.7983174920082092, "logits/rejected": -0.942952036857605, "logps/chosen": -1.61797297000885, "logps/rejected": -1.5470950603485107, "loss": 1.3583, "odds_ratio_loss": 0.9691923260688782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08089864999055862, "rewards/margins": -0.003543901490047574, "rewards/rejected": -0.07735475152730942, "sft_loss": 1.61797297000885, "step": 470 }, { "epoch": 0.38, "grad_norm": 10.262155313138615, "learning_rate": 4.989176976624511e-06, "logits/chosen": -0.5470207929611206, "logits/rejected": -0.8008524179458618, "logps/chosen": -1.2335219383239746, "logps/rejected": -1.537672758102417, "loss": 1.3457, "odds_ratio_loss": 0.6203063130378723, "rewards/accuracies": 0.5, "rewards/chosen": -0.06167609617114067, "rewards/margins": 0.015207541175186634, "rewards/rejected": -0.07688363641500473, "sft_loss": 1.2335219383239746, "step": 475 }, { "epoch": 0.384, "grad_norm": 8.161147591651558, "learning_rate": 4.988068499954578e-06, "logits/chosen": -0.3918524384498596, "logits/rejected": -0.5939075946807861, "logps/chosen": -1.3265020847320557, "logps/rejected": -1.2360165119171143, "loss": 1.2193, "odds_ratio_loss": 0.7953775525093079, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0663251057267189, "rewards/margins": -0.004524278454482555, "rewards/rejected": -0.06180082634091377, "sft_loss": 1.3265020847320557, "step": 480 }, { "epoch": 0.388, "grad_norm": 7.500330838134909, "learning_rate": 4.986906127630346e-06, "logits/chosen": -0.4039885401725769, "logits/rejected": -0.5603979825973511, "logps/chosen": -1.1305885314941406, "logps/rejected": -1.2077453136444092, "loss": 1.31, "odds_ratio_loss": 0.710327684879303, "rewards/accuracies": 0.5, "rewards/chosen": -0.05652942508459091, "rewards/margins": 0.0038578410167247057, "rewards/rejected": -0.0603872649371624, "sft_loss": 1.1305885314941406, "step": 485 }, { "epoch": 0.392, "grad_norm": 7.430782365855567, "learning_rate": 4.985689884830711e-06, "logits/chosen": -0.5145419836044312, "logits/rejected": -0.5283325910568237, "logps/chosen": -0.8873690366744995, "logps/rejected": -1.0988765954971313, "loss": 1.2163, "odds_ratio_loss": 0.5778387784957886, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.044368453323841095, "rewards/margins": 0.010575374588370323, "rewards/rejected": -0.05494382977485657, "sft_loss": 0.8873690366744995, "step": 490 }, { "epoch": 0.396, "grad_norm": 7.833399890164202, "learning_rate": 4.984419797901491e-06, "logits/chosen": -0.6590821743011475, "logits/rejected": -0.7595802545547485, "logps/chosen": -1.2602180242538452, "logps/rejected": -1.3176265954971313, "loss": 1.2691, "odds_ratio_loss": 0.8493593335151672, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06301090866327286, "rewards/margins": 0.0028704279102385044, "rewards/rejected": -0.06588132679462433, "sft_loss": 1.2602180242538452, "step": 495 }, { "epoch": 0.4, "grad_norm": 18.632214535791704, "learning_rate": 4.983095894354858e-06, "logits/chosen": -0.4741068482398987, "logits/rejected": -0.6550690531730652, "logps/chosen": -1.0877597332000732, "logps/rejected": -1.4105587005615234, "loss": 1.2655, "odds_ratio_loss": 0.6428815126419067, "rewards/accuracies": 0.5, "rewards/chosen": -0.05438799411058426, "rewards/margins": 0.01613995060324669, "rewards/rejected": -0.07052794098854065, "sft_loss": 1.0877597332000732, "step": 500 }, { "epoch": 0.404, "grad_norm": 13.093628531972609, "learning_rate": 4.981718202868738e-06, "logits/chosen": -0.37520939111709595, "logits/rejected": -0.5333417654037476, "logps/chosen": -1.2307997941970825, "logps/rejected": -1.3670756816864014, "loss": 1.3233, "odds_ratio_loss": 0.7391397356987, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06153998523950577, "rewards/margins": 0.006813795771449804, "rewards/rejected": -0.06835378706455231, "sft_loss": 1.2307997941970825, "step": 505 }, { "epoch": 0.408, "grad_norm": 32.02171447844317, "learning_rate": 4.980286753286196e-06, "logits/chosen": -1.0221941471099854, "logits/rejected": -0.6870493292808533, "logps/chosen": -1.1637510061264038, "logps/rejected": -1.1249791383743286, "loss": 1.2777, "odds_ratio_loss": 0.7576397061347961, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.05818755179643631, "rewards/margins": -0.0019385985797271132, "rewards/rejected": -0.05624895542860031, "sft_loss": 1.1637510061264038, "step": 510 }, { "epoch": 0.412, "grad_norm": 8.922962337548432, "learning_rate": 4.978801576614779e-06, "logits/chosen": -0.5233002305030823, "logits/rejected": -0.7131946086883545, "logps/chosen": -1.1369407176971436, "logps/rejected": -1.2576231956481934, "loss": 1.3731, "odds_ratio_loss": 0.630257248878479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05684703588485718, "rewards/margins": 0.00603412976488471, "rewards/rejected": -0.06288117170333862, "sft_loss": 1.1369407176971436, "step": 515 }, { "epoch": 0.416, "grad_norm": 6.071002612452945, "learning_rate": 4.97726270502586e-06, "logits/chosen": -0.751970112323761, "logits/rejected": -0.5403101444244385, "logps/chosen": -1.125468373298645, "logps/rejected": -1.4324285984039307, "loss": 1.2763, "odds_ratio_loss": 0.6366511583328247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05627341940999031, "rewards/margins": 0.015348007902503014, "rewards/rejected": -0.07162143290042877, "sft_loss": 1.125468373298645, "step": 520 }, { "epoch": 0.42, "grad_norm": 19.360606964673867, "learning_rate": 4.975670171853926e-06, "logits/chosen": -0.6735143065452576, "logits/rejected": -0.5386877059936523, "logps/chosen": -1.3962455987930298, "logps/rejected": -1.3698430061340332, "loss": 1.3162, "odds_ratio_loss": 0.7852639555931091, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06981228291988373, "rewards/margins": -0.0013201329857110977, "rewards/rejected": -0.06849215179681778, "sft_loss": 1.3962455987930298, "step": 525 }, { "epoch": 0.424, "grad_norm": 12.60459466219797, "learning_rate": 4.974024011595864e-06, "logits/chosen": -0.726483941078186, "logits/rejected": -0.45278626680374146, "logps/chosen": -1.1809227466583252, "logps/rejected": -1.4415466785430908, "loss": 1.2782, "odds_ratio_loss": 0.5937660932540894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05904613807797432, "rewards/margins": 0.013031196780502796, "rewards/rejected": -0.07207732647657394, "sft_loss": 1.1809227466583252, "step": 530 }, { "epoch": 0.428, "grad_norm": 16.735047153184535, "learning_rate": 4.97232425991021e-06, "logits/chosen": -0.7554991245269775, "logits/rejected": -0.6952065825462341, "logps/chosen": -1.2135225534439087, "logps/rejected": -1.2788511514663696, "loss": 1.2871, "odds_ratio_loss": 0.6615133881568909, "rewards/accuracies": 0.5, "rewards/chosen": -0.060676127672195435, "rewards/margins": 0.0032664339523762465, "rewards/rejected": -0.0639425590634346, "sft_loss": 1.2135225534439087, "step": 535 }, { "epoch": 0.432, "grad_norm": 28.636993191829472, "learning_rate": 4.970570953616383e-06, "logits/chosen": -0.3668878972530365, "logits/rejected": -0.47576600313186646, "logps/chosen": -1.254703402519226, "logps/rejected": -1.4109866619110107, "loss": 1.3499, "odds_ratio_loss": 0.6886258125305176, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0627351701259613, "rewards/margins": 0.007814161479473114, "rewards/rejected": -0.07054933160543442, "sft_loss": 1.254703402519226, "step": 540 }, { "epoch": 0.436, "grad_norm": 8.20123877269534, "learning_rate": 4.9687641306938766e-06, "logits/chosen": -0.596192479133606, "logits/rejected": -0.34622710943222046, "logps/chosen": -1.258967399597168, "logps/rejected": -1.168402910232544, "loss": 1.2366, "odds_ratio_loss": 0.9441742897033691, "rewards/accuracies": 0.5, "rewards/chosen": -0.06294836848974228, "rewards/margins": -0.004528213292360306, "rewards/rejected": -0.058420151472091675, "sft_loss": 1.258967399597168, "step": 545 }, { "epoch": 0.44, "grad_norm": 19.0599247063106, "learning_rate": 4.966903830281449e-06, "logits/chosen": -0.550312876701355, "logits/rejected": -0.7260756492614746, "logps/chosen": -0.8798558115959167, "logps/rejected": -0.9841960072517395, "loss": 1.2888, "odds_ratio_loss": 0.6650792360305786, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.043992795050144196, "rewards/margins": 0.00521700968965888, "rewards/rejected": -0.049209803342819214, "sft_loss": 0.8798558115959167, "step": 550 }, { "epoch": 0.444, "grad_norm": 9.539583052659502, "learning_rate": 4.964990092676263e-06, "logits/chosen": -0.5485479235649109, "logits/rejected": -0.2632920444011688, "logps/chosen": -1.2334239482879639, "logps/rejected": -1.2576792240142822, "loss": 1.1946, "odds_ratio_loss": 0.7366870641708374, "rewards/accuracies": 0.5, "rewards/chosen": -0.06167120486497879, "rewards/margins": 0.0012127698864787817, "rewards/rejected": -0.06288396567106247, "sft_loss": 1.2334239482879639, "step": 555 }, { "epoch": 0.448, "grad_norm": 6.575286386431191, "learning_rate": 4.9630229593330226e-06, "logits/chosen": -0.5613077878952026, "logits/rejected": -0.9460631608963013, "logps/chosen": -1.4012134075164795, "logps/rejected": -1.474373698234558, "loss": 1.356, "odds_ratio_loss": 0.7623913884162903, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07006067782640457, "rewards/margins": 0.0036580085288733244, "rewards/rejected": -0.07371868193149567, "sft_loss": 1.4012134075164795, "step": 560 }, { "epoch": 0.452, "grad_norm": 8.260756058648278, "learning_rate": 4.96100247286307e-06, "logits/chosen": -0.3762677311897278, "logits/rejected": -0.681728184223175, "logps/chosen": -1.2107107639312744, "logps/rejected": -1.336549162864685, "loss": 1.2595, "odds_ratio_loss": 0.6657492518424988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06053553894162178, "rewards/margins": 0.006291926838457584, "rewards/rejected": -0.06682746857404709, "sft_loss": 1.2107107639312744, "step": 565 }, { "epoch": 0.456, "grad_norm": 5.224482900966935, "learning_rate": 4.958928677033465e-06, "logits/chosen": -0.5366466045379639, "logits/rejected": -0.2813822627067566, "logps/chosen": -1.0963077545166016, "logps/rejected": -1.9244539737701416, "loss": 1.1666, "odds_ratio_loss": 0.4927639961242676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0548153892159462, "rewards/margins": 0.04140730947256088, "rewards/rejected": -0.09622270613908768, "sft_loss": 1.0963077545166016, "step": 570 }, { "epoch": 0.46, "grad_norm": 9.231089411843929, "learning_rate": 4.956801616766033e-06, "logits/chosen": -0.41479817032814026, "logits/rejected": -1.2379553318023682, "logps/chosen": -1.3129132986068726, "logps/rejected": -1.4864296913146973, "loss": 1.2148, "odds_ratio_loss": 0.6478667259216309, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06564567983150482, "rewards/margins": 0.008675819262862206, "rewards/rejected": -0.07432149350643158, "sft_loss": 1.3129132986068726, "step": 575 }, { "epoch": 0.464, "grad_norm": 12.571799699194248, "learning_rate": 4.954621338136399e-06, "logits/chosen": -0.43359094858169556, "logits/rejected": -0.7193197011947632, "logps/chosen": -1.1483484506607056, "logps/rejected": -1.4485629796981812, "loss": 1.217, "odds_ratio_loss": 0.5974001288414001, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05741741508245468, "rewards/margins": 0.015010732226073742, "rewards/rejected": -0.0724281519651413, "sft_loss": 1.1483484506607056, "step": 580 }, { "epoch": 0.468, "grad_norm": 13.14113848877051, "learning_rate": 4.9523878883729794e-06, "logits/chosen": -0.3140491545200348, "logits/rejected": -1.1171773672103882, "logps/chosen": -1.1811208724975586, "logps/rejected": -1.2877556085586548, "loss": 1.3736, "odds_ratio_loss": 0.6588929295539856, "rewards/accuracies": 0.5, "rewards/chosen": -0.05905604362487793, "rewards/margins": 0.005331738851964474, "rewards/rejected": -0.06438778340816498, "sft_loss": 1.1811208724975586, "step": 585 }, { "epoch": 0.472, "grad_norm": 6.741763650848641, "learning_rate": 4.95010131585597e-06, "logits/chosen": -0.41005969047546387, "logits/rejected": -1.1937333345413208, "logps/chosen": -1.1939094066619873, "logps/rejected": -1.519858717918396, "loss": 1.3015, "odds_ratio_loss": 0.5203220844268799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05969547480344772, "rewards/margins": 0.016297463327646255, "rewards/rejected": -0.07599293440580368, "sft_loss": 1.1939094066619873, "step": 590 }, { "epoch": 0.476, "grad_norm": 11.415912203136902, "learning_rate": 4.94776167011629e-06, "logits/chosen": -0.8332249522209167, "logits/rejected": -1.193943738937378, "logps/chosen": -1.5026148557662964, "logps/rejected": -1.3563969135284424, "loss": 1.318, "odds_ratio_loss": 0.8462100028991699, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.07513075321912766, "rewards/margins": -0.007310903165489435, "rewards/rejected": -0.06781984865665436, "sft_loss": 1.5026148557662964, "step": 595 }, { "epoch": 0.48, "grad_norm": 6.517733955264292, "learning_rate": 4.9453690018345144e-06, "logits/chosen": -0.7536684274673462, "logits/rejected": -0.5858591794967651, "logps/chosen": -1.0818665027618408, "logps/rejected": -0.9140428304672241, "loss": 1.2555, "odds_ratio_loss": 0.847333550453186, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.05409332364797592, "rewards/margins": -0.008391184732317924, "rewards/rejected": -0.045702140778303146, "sft_loss": 1.0818665027618408, "step": 600 }, { "epoch": 0.484, "grad_norm": 11.605525874938747, "learning_rate": 4.94292336283977e-06, "logits/chosen": -0.4545044004917145, "logits/rejected": -0.9466699361801147, "logps/chosen": -1.3461689949035645, "logps/rejected": -1.257264256477356, "loss": 1.2515, "odds_ratio_loss": 0.7972198724746704, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0673084557056427, "rewards/margins": -0.0044452352449297905, "rewards/rejected": -0.06286321580410004, "sft_loss": 1.3461689949035645, "step": 605 }, { "epoch": 0.488, "grad_norm": 16.623733933538222, "learning_rate": 4.940424806108619e-06, "logits/chosen": -0.44307154417037964, "logits/rejected": -1.1300188302993774, "logps/chosen": -1.5469882488250732, "logps/rejected": -1.546287178993225, "loss": 1.296, "odds_ratio_loss": 0.7446098923683167, "rewards/accuracies": 0.5, "rewards/chosen": -0.07734941691160202, "rewards/margins": -3.506392386043444e-05, "rewards/rejected": -0.0773143544793129, "sft_loss": 1.5469882488250732, "step": 610 }, { "epoch": 0.492, "grad_norm": 14.50351733733393, "learning_rate": 4.937873385763909e-06, "logits/chosen": -0.4979380667209625, "logits/rejected": -0.7332254648208618, "logps/chosen": -1.186477780342102, "logps/rejected": -1.4961540699005127, "loss": 1.2214, "odds_ratio_loss": 0.5873562097549438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05932389572262764, "rewards/margins": 0.015483811497688293, "rewards/rejected": -0.07480770349502563, "sft_loss": 1.186477780342102, "step": 615 }, { "epoch": 0.496, "grad_norm": 10.189101808498776, "learning_rate": 4.935269157073597e-06, "logits/chosen": -0.30127787590026855, "logits/rejected": -0.2239534556865692, "logps/chosen": -1.0941071510314941, "logps/rejected": -1.155369520187378, "loss": 1.2658, "odds_ratio_loss": 0.7786355018615723, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.054705359041690826, "rewards/margins": 0.0030631190165877342, "rewards/rejected": -0.057768482714891434, "sft_loss": 1.0941071510314941, "step": 620 }, { "epoch": 0.5, "grad_norm": 6.717391051471845, "learning_rate": 4.93261217644956e-06, "logits/chosen": -0.6986908316612244, "logits/rejected": -0.6756628751754761, "logps/chosen": -1.0286588668823242, "logps/rejected": -1.368683099746704, "loss": 1.2814, "odds_ratio_loss": 0.5862299203872681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05143294483423233, "rewards/margins": 0.017001213505864143, "rewards/rejected": -0.06843416392803192, "sft_loss": 1.0286588668823242, "step": 625 }, { "epoch": 0.504, "grad_norm": 4.84256770780635, "learning_rate": 4.9299025014463665e-06, "logits/chosen": -0.6707471609115601, "logits/rejected": -0.4917599558830261, "logps/chosen": -1.401749849319458, "logps/rejected": -1.291032075881958, "loss": 1.2497, "odds_ratio_loss": 0.8192909359931946, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0700874850153923, "rewards/margins": -0.005535887088626623, "rewards/rejected": -0.06455160677433014, "sft_loss": 1.401749849319458, "step": 630 }, { "epoch": 0.508, "grad_norm": 5.259599717240724, "learning_rate": 4.92714019076003e-06, "logits/chosen": -0.6145176291465759, "logits/rejected": -0.6850901246070862, "logps/chosen": -1.1875053644180298, "logps/rejected": -1.5674359798431396, "loss": 1.2434, "odds_ratio_loss": 0.4647662043571472, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.059375274926424026, "rewards/margins": 0.018996523693203926, "rewards/rejected": -0.0783717930316925, "sft_loss": 1.1875053644180298, "step": 635 }, { "epoch": 0.512, "grad_norm": 8.97887669520296, "learning_rate": 4.924325304226745e-06, "logits/chosen": -0.509232223033905, "logits/rejected": -0.804535984992981, "logps/chosen": -0.9804172515869141, "logps/rejected": -1.4311110973358154, "loss": 1.2258, "odds_ratio_loss": 0.47105565667152405, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04902086406946182, "rewards/margins": 0.022534683346748352, "rewards/rejected": -0.07155554741621017, "sft_loss": 0.9804172515869141, "step": 640 }, { "epoch": 0.516, "grad_norm": 12.339274811909346, "learning_rate": 4.921457902821578e-06, "logits/chosen": -0.6703714728355408, "logits/rejected": -0.7211524844169617, "logps/chosen": -1.412858247756958, "logps/rejected": -1.7014133930206299, "loss": 1.2609, "odds_ratio_loss": 0.6318767666816711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07064291089773178, "rewards/margins": 0.014427749440073967, "rewards/rejected": -0.0850706622004509, "sft_loss": 1.412858247756958, "step": 645 }, { "epoch": 0.52, "grad_norm": 6.902631133651239, "learning_rate": 4.91853804865716e-06, "logits/chosen": -0.423556387424469, "logits/rejected": -0.6117393970489502, "logps/chosen": -0.9919289350509644, "logps/rejected": -1.5979692935943604, "loss": 1.2347, "odds_ratio_loss": 0.4265903830528259, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.049596451222896576, "rewards/margins": 0.0303020179271698, "rewards/rejected": -0.07989846915006638, "sft_loss": 0.9919289350509644, "step": 650 }, { "epoch": 0.524, "grad_norm": 20.833564963024443, "learning_rate": 4.915565804982332e-06, "logits/chosen": -0.6421648263931274, "logits/rejected": -0.2954399287700653, "logps/chosen": -1.2931578159332275, "logps/rejected": -1.3003164529800415, "loss": 1.2488, "odds_ratio_loss": 0.7667359113693237, "rewards/accuracies": 0.5, "rewards/chosen": -0.06465788185596466, "rewards/margins": 0.0003579244075808674, "rewards/rejected": -0.06501581519842148, "sft_loss": 1.2931578159332275, "step": 655 }, { "epoch": 0.528, "grad_norm": 5.965871281558726, "learning_rate": 4.912541236180779e-06, "logits/chosen": -0.6520070433616638, "logits/rejected": -0.7092788815498352, "logps/chosen": -1.151734471321106, "logps/rejected": -1.186388611793518, "loss": 1.1441, "odds_ratio_loss": 0.7639530301094055, "rewards/accuracies": 0.5, "rewards/chosen": -0.05758672207593918, "rewards/margins": 0.0017327029490843415, "rewards/rejected": -0.059319429099559784, "sft_loss": 1.151734471321106, "step": 660 }, { "epoch": 0.532, "grad_norm": 6.664910493167132, "learning_rate": 4.909464407769633e-06, "logits/chosen": -0.5549234747886658, "logits/rejected": -0.8910662531852722, "logps/chosen": -1.2270019054412842, "logps/rejected": -1.200127363204956, "loss": 1.23, "odds_ratio_loss": 0.8013579249382019, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06135009601712227, "rewards/margins": -0.0013437264133244753, "rewards/rejected": -0.060006361454725266, "sft_loss": 1.2270019054412842, "step": 665 }, { "epoch": 0.536, "grad_norm": 5.1055719510419335, "learning_rate": 4.9063353863980565e-06, "logits/chosen": -0.4341130256652832, "logits/rejected": -0.791808545589447, "logps/chosen": -1.1596591472625732, "logps/rejected": -1.5821669101715088, "loss": 1.2846, "odds_ratio_loss": 0.5709139108657837, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05798295885324478, "rewards/margins": 0.021125389263033867, "rewards/rejected": -0.0791083499789238, "sft_loss": 1.1596591472625732, "step": 670 }, { "epoch": 0.54, "grad_norm": 7.5751885145020506, "learning_rate": 4.903154239845798e-06, "logits/chosen": -0.4769849181175232, "logits/rejected": -0.5648764967918396, "logps/chosen": -1.1110217571258545, "logps/rejected": -1.4287192821502686, "loss": 1.2388, "odds_ratio_loss": 0.5047262907028198, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05555109307169914, "rewards/margins": 0.015884879976511, "rewards/rejected": -0.07143596559762955, "sft_loss": 1.1110217571258545, "step": 675 }, { "epoch": 0.544, "grad_norm": 7.921191214189487, "learning_rate": 4.899921037021719e-06, "logits/chosen": -0.36087316274642944, "logits/rejected": -0.1616624891757965, "logps/chosen": -1.1064417362213135, "logps/rejected": -0.9575250744819641, "loss": 1.2096, "odds_ratio_loss": 0.8907485008239746, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.05532208830118179, "rewards/margins": -0.007445829920470715, "rewards/rejected": -0.047876257449388504, "sft_loss": 1.1064417362213135, "step": 680 }, { "epoch": 0.548, "grad_norm": 12.574180795892904, "learning_rate": 4.896635847962311e-06, "logits/chosen": -0.6308740973472595, "logits/rejected": -0.6598986387252808, "logps/chosen": -1.0816127061843872, "logps/rejected": -1.5012505054473877, "loss": 1.2037, "odds_ratio_loss": 0.6682350635528564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05408062785863876, "rewards/margins": 0.02098189666867256, "rewards/rejected": -0.07506252825260162, "sft_loss": 1.0816127061843872, "step": 685 }, { "epoch": 0.552, "grad_norm": 11.992784752793785, "learning_rate": 4.893298743830168e-06, "logits/chosen": -0.44944000244140625, "logits/rejected": -0.8219810724258423, "logps/chosen": -1.5772783756256104, "logps/rejected": -1.2028121948242188, "loss": 1.2864, "odds_ratio_loss": 1.1701303720474243, "rewards/accuracies": 0.5, "rewards/chosen": -0.07886390388011932, "rewards/margins": -0.018723303452134132, "rewards/rejected": -0.06014060229063034, "sft_loss": 1.5772783756256104, "step": 690 }, { "epoch": 0.556, "grad_norm": 8.358855854874342, "learning_rate": 4.889909796912454e-06, "logits/chosen": -0.28839197754859924, "logits/rejected": -0.44521206617355347, "logps/chosen": -1.1581156253814697, "logps/rejected": -1.1767852306365967, "loss": 1.2989, "odds_ratio_loss": 0.6894288063049316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.057905782014131546, "rewards/margins": 0.0009334773058071733, "rewards/rejected": -0.058839261531829834, "sft_loss": 1.1581156253814697, "step": 695 }, { "epoch": 0.56, "grad_norm": 6.831254137005319, "learning_rate": 4.88646908061933e-06, "logits/chosen": -0.6353722214698792, "logits/rejected": -0.7715078592300415, "logps/chosen": -1.5473369359970093, "logps/rejected": -1.5499107837677002, "loss": 1.2584, "odds_ratio_loss": 0.7936614751815796, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07736685127019882, "rewards/margins": 0.00012868604972027242, "rewards/rejected": -0.0774955302476883, "sft_loss": 1.5473369359970093, "step": 700 }, { "epoch": 0.564, "grad_norm": 10.218090713494131, "learning_rate": 4.882976669482368e-06, "logits/chosen": -0.6931608319282532, "logits/rejected": -0.6439999341964722, "logps/chosen": -1.328382968902588, "logps/rejected": -1.6422935724258423, "loss": 1.3709, "odds_ratio_loss": 0.5778505802154541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06641916185617447, "rewards/margins": 0.015695523470640182, "rewards/rejected": -0.08211468160152435, "sft_loss": 1.328382968902588, "step": 705 }, { "epoch": 0.568, "grad_norm": 5.768384836009137, "learning_rate": 4.879432639152935e-06, "logits/chosen": -0.5731868147850037, "logits/rejected": -0.9456470608711243, "logps/chosen": -0.9817520976066589, "logps/rejected": -1.3865406513214111, "loss": 1.2994, "odds_ratio_loss": 0.48160356283187866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.049087610095739365, "rewards/margins": 0.02023942954838276, "rewards/rejected": -0.06932704150676727, "sft_loss": 0.9817520976066589, "step": 710 }, { "epoch": 0.572, "grad_norm": 7.509810654555548, "learning_rate": 4.875837066400553e-06, "logits/chosen": -0.691825807094574, "logits/rejected": -0.5271461606025696, "logps/chosen": -0.9871689677238464, "logps/rejected": -1.324852705001831, "loss": 1.2221, "odds_ratio_loss": 0.5907098054885864, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04935844987630844, "rewards/margins": 0.016884196549654007, "rewards/rejected": -0.06624264270067215, "sft_loss": 0.9871689677238464, "step": 715 }, { "epoch": 0.576, "grad_norm": 24.694839731628768, "learning_rate": 4.8721900291112415e-06, "logits/chosen": -0.8862325549125671, "logits/rejected": -0.5018847584724426, "logps/chosen": -1.0031644105911255, "logps/rejected": -1.247604250907898, "loss": 1.2261, "odds_ratio_loss": 0.6080331802368164, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05015822499990463, "rewards/margins": 0.012221988290548325, "rewards/rejected": -0.06238021329045296, "sft_loss": 1.0031644105911255, "step": 720 }, { "epoch": 0.58, "grad_norm": 26.923254409620736, "learning_rate": 4.868491606285823e-06, "logits/chosen": -0.5057476162910461, "logits/rejected": -0.7251291275024414, "logps/chosen": -1.470099687576294, "logps/rejected": -1.3734233379364014, "loss": 1.3296, "odds_ratio_loss": 0.9267051815986633, "rewards/accuracies": 0.5, "rewards/chosen": -0.07350499927997589, "rewards/margins": -0.004833821672946215, "rewards/rejected": -0.06867116689682007, "sft_loss": 1.470099687576294, "step": 725 }, { "epoch": 0.584, "grad_norm": 7.574149129374884, "learning_rate": 4.864741878038218e-06, "logits/chosen": -0.46996521949768066, "logits/rejected": -0.6176045536994934, "logps/chosen": -1.454437017440796, "logps/rejected": -1.3104689121246338, "loss": 1.2605, "odds_ratio_loss": 0.8346797823905945, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07272185385227203, "rewards/margins": -0.007198403589427471, "rewards/rejected": -0.06552345305681229, "sft_loss": 1.454437017440796, "step": 730 }, { "epoch": 0.588, "grad_norm": 8.305352840446517, "learning_rate": 4.860940925593703e-06, "logits/chosen": -0.30859681963920593, "logits/rejected": -1.0121045112609863, "logps/chosen": -1.1916425228118896, "logps/rejected": -1.6442506313323975, "loss": 1.2909, "odds_ratio_loss": 0.5468284487724304, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05958213284611702, "rewards/margins": 0.02263040840625763, "rewards/rejected": -0.08221253752708435, "sft_loss": 1.1916425228118896, "step": 735 }, { "epoch": 0.592, "grad_norm": 30.679914716790897, "learning_rate": 4.857088831287158e-06, "logits/chosen": -0.3152746558189392, "logits/rejected": -0.498546838760376, "logps/chosen": -1.2811144590377808, "logps/rejected": -1.4738355875015259, "loss": 1.242, "odds_ratio_loss": 0.6398525834083557, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06405572593212128, "rewards/margins": 0.009636052884161472, "rewards/rejected": -0.07369177788496017, "sft_loss": 1.2811144590377808, "step": 740 }, { "epoch": 0.596, "grad_norm": 7.06148057428711, "learning_rate": 4.85318567856128e-06, "logits/chosen": -0.6131590604782104, "logits/rejected": -0.22784848511219025, "logps/chosen": -1.190834879875183, "logps/rejected": -1.4315545558929443, "loss": 1.211, "odds_ratio_loss": 0.6610680818557739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.059541743248701096, "rewards/margins": 0.012035989202558994, "rewards/rejected": -0.07157773524522781, "sft_loss": 1.190834879875183, "step": 745 }, { "epoch": 0.6, "grad_norm": 10.554654256963408, "learning_rate": 4.849231551964771e-06, "logits/chosen": -0.30865949392318726, "logits/rejected": -0.6756810545921326, "logps/chosen": -0.9886703491210938, "logps/rejected": -1.524072289466858, "loss": 1.149, "odds_ratio_loss": 0.4154701828956604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.049433521926403046, "rewards/margins": 0.0267700906842947, "rewards/rejected": -0.0762036144733429, "sft_loss": 0.9886703491210938, "step": 750 }, { "epoch": 0.604, "grad_norm": 8.5013064131026, "learning_rate": 4.8452265371505176e-06, "logits/chosen": -0.3945849537849426, "logits/rejected": -0.6222106218338013, "logps/chosen": -1.2622836828231812, "logps/rejected": -1.3229162693023682, "loss": 1.2813, "odds_ratio_loss": 0.7592854499816895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06311418116092682, "rewards/margins": 0.0030316333286464214, "rewards/rejected": -0.06614581495523453, "sft_loss": 1.2622836828231812, "step": 755 }, { "epoch": 0.608, "grad_norm": 6.339074653596066, "learning_rate": 4.841170720873723e-06, "logits/chosen": -0.6454433798789978, "logits/rejected": -0.5933985710144043, "logps/chosen": -1.0739322900772095, "logps/rejected": -1.2623248100280762, "loss": 1.2882, "odds_ratio_loss": 0.6267717480659485, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.053696613758802414, "rewards/margins": 0.009419633075594902, "rewards/rejected": -0.06311623752117157, "sft_loss": 1.0739322900772095, "step": 760 }, { "epoch": 0.612, "grad_norm": 9.528245360044645, "learning_rate": 4.837064190990036e-06, "logits/chosen": -0.5372971296310425, "logits/rejected": -0.4304935932159424, "logps/chosen": -1.3918521404266357, "logps/rejected": -1.2833397388458252, "loss": 1.2469, "odds_ratio_loss": 0.8755296468734741, "rewards/accuracies": 0.5, "rewards/chosen": -0.06959259510040283, "rewards/margins": -0.005425615236163139, "rewards/rejected": -0.06416698545217514, "sft_loss": 1.3918521404266357, "step": 765 }, { "epoch": 0.616, "grad_norm": 45.4719267607946, "learning_rate": 4.832907036453647e-06, "logits/chosen": -0.6772319078445435, "logits/rejected": -0.29380422830581665, "logps/chosen": -1.3806759119033813, "logps/rejected": -1.4640812873840332, "loss": 1.2546, "odds_ratio_loss": 0.7038690447807312, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06903379410505295, "rewards/margins": 0.00417026923969388, "rewards/rejected": -0.07320406287908554, "sft_loss": 1.3806759119033813, "step": 770 }, { "epoch": 0.62, "grad_norm": 6.67487476307786, "learning_rate": 4.828699347315357e-06, "logits/chosen": -0.43924084305763245, "logits/rejected": -0.685263991355896, "logps/chosen": -1.3941123485565186, "logps/rejected": -1.5404117107391357, "loss": 1.3004, "odds_ratio_loss": 0.6843758821487427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06970562040805817, "rewards/margins": 0.0073149665258824825, "rewards/rejected": -0.07702059298753738, "sft_loss": 1.3941123485565186, "step": 775 }, { "epoch": 0.624, "grad_norm": 8.063952217109858, "learning_rate": 4.824441214720629e-06, "logits/chosen": -0.4597851634025574, "logits/rejected": -0.7097247242927551, "logps/chosen": -1.4838628768920898, "logps/rejected": -1.552299976348877, "loss": 1.3045, "odds_ratio_loss": 0.7198641300201416, "rewards/accuracies": 0.5, "rewards/chosen": -0.07419314980506897, "rewards/margins": 0.0034218481741845608, "rewards/rejected": -0.07761499285697937, "sft_loss": 1.4838628768920898, "step": 780 }, { "epoch": 0.628, "grad_norm": 8.186475314952034, "learning_rate": 4.8201327309076176e-06, "logits/chosen": -0.6055313944816589, "logits/rejected": -0.5660229325294495, "logps/chosen": -1.3449480533599854, "logps/rejected": -1.3947374820709229, "loss": 1.2475, "odds_ratio_loss": 0.701248824596405, "rewards/accuracies": 0.5, "rewards/chosen": -0.0672474056482315, "rewards/margins": 0.0024894648231565952, "rewards/rejected": -0.06973687559366226, "sft_loss": 1.3449480533599854, "step": 785 }, { "epoch": 0.632, "grad_norm": 11.28386887635978, "learning_rate": 4.815773989205165e-06, "logits/chosen": -0.4913761615753174, "logits/rejected": -0.7349493503570557, "logps/chosen": -1.3903801441192627, "logps/rejected": -1.261516809463501, "loss": 1.2877, "odds_ratio_loss": 0.858357310295105, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06951900571584702, "rewards/margins": -0.006443170364946127, "rewards/rejected": -0.06307584047317505, "sft_loss": 1.3903801441192627, "step": 790 }, { "epoch": 0.636, "grad_norm": 15.974062267930158, "learning_rate": 4.811365084030784e-06, "logits/chosen": -0.6103811264038086, "logits/rejected": -0.6908004879951477, "logps/chosen": -1.1836979389190674, "logps/rejected": -1.4412634372711182, "loss": 1.2186, "odds_ratio_loss": 0.599416971206665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05918489769101143, "rewards/margins": 0.012878276407718658, "rewards/rejected": -0.07206316292285919, "sft_loss": 1.1836979389190674, "step": 795 }, { "epoch": 0.64, "grad_norm": 9.332345967135447, "learning_rate": 4.806906110888606e-06, "logits/chosen": -0.10099928081035614, "logits/rejected": -0.21330437064170837, "logps/chosen": -0.9208562970161438, "logps/rejected": -1.2308391332626343, "loss": 1.2217, "odds_ratio_loss": 0.5371675491333008, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04604281857609749, "rewards/margins": 0.01549914013594389, "rewards/rejected": -0.061541955918073654, "sft_loss": 0.9208562970161438, "step": 800 }, { "epoch": 0.644, "grad_norm": 6.712188908944328, "learning_rate": 4.8023971663673235e-06, "logits/chosen": -0.45073431730270386, "logits/rejected": -0.6796762347221375, "logps/chosen": -1.2248754501342773, "logps/rejected": -1.1859427690505981, "loss": 1.2317, "odds_ratio_loss": 0.9216899871826172, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06124377250671387, "rewards/margins": -0.0019466314697638154, "rewards/rejected": -0.059297144412994385, "sft_loss": 1.2248754501342773, "step": 805 }, { "epoch": 0.648, "grad_norm": 6.089598640062157, "learning_rate": 4.7978383481380865e-06, "logits/chosen": -0.7606781125068665, "logits/rejected": -0.34865307807922363, "logps/chosen": -1.3903577327728271, "logps/rejected": -1.3126481771469116, "loss": 1.2731, "odds_ratio_loss": 0.8095407485961914, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06951788067817688, "rewards/margins": -0.003885473357513547, "rewards/rejected": -0.0656324103474617, "sft_loss": 1.3903577327728271, "step": 810 }, { "epoch": 0.652, "grad_norm": 8.775544099907432, "learning_rate": 4.793229754952393e-06, "logits/chosen": -0.9465087652206421, "logits/rejected": -0.8528397679328918, "logps/chosen": -1.3370808362960815, "logps/rejected": -1.6353280544281006, "loss": 1.2299, "odds_ratio_loss": 0.5765669941902161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06685404479503632, "rewards/margins": 0.01491236686706543, "rewards/rejected": -0.08176640421152115, "sft_loss": 1.3370808362960815, "step": 815 }, { "epoch": 0.656, "grad_norm": 9.946349607495078, "learning_rate": 4.788571486639948e-06, "logits/chosen": -0.538291335105896, "logits/rejected": -0.8991764783859253, "logps/chosen": -1.3249986171722412, "logps/rejected": -1.2387126684188843, "loss": 1.1861, "odds_ratio_loss": 0.8351901173591614, "rewards/accuracies": 0.5, "rewards/chosen": -0.06624993681907654, "rewards/margins": -0.004314306192100048, "rewards/rejected": -0.061935633420944214, "sft_loss": 1.3249986171722412, "step": 820 }, { "epoch": 0.66, "grad_norm": 13.684922078518843, "learning_rate": 4.783863644106502e-06, "logits/chosen": -0.40957608819007874, "logits/rejected": -1.0564932823181152, "logps/chosen": -1.2083319425582886, "logps/rejected": -1.3241065740585327, "loss": 1.2727, "odds_ratio_loss": 0.7248133420944214, "rewards/accuracies": 0.5, "rewards/chosen": -0.06041660159826279, "rewards/margins": 0.005788723472505808, "rewards/rejected": -0.06620533019304276, "sft_loss": 1.2083319425582886, "step": 825 }, { "epoch": 0.664, "grad_norm": 14.240624119385929, "learning_rate": 4.779106329331665e-06, "logits/chosen": -0.6208971738815308, "logits/rejected": -0.53315269947052, "logps/chosen": -1.327461838722229, "logps/rejected": -1.1612827777862549, "loss": 1.3201, "odds_ratio_loss": 0.9040085077285767, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06637309491634369, "rewards/margins": -0.008308951742947102, "rewards/rejected": -0.058064140379428864, "sft_loss": 1.327461838722229, "step": 830 }, { "epoch": 0.668, "grad_norm": 6.175231699145468, "learning_rate": 4.774299645366696e-06, "logits/chosen": -0.5456127524375916, "logits/rejected": -0.816623330116272, "logps/chosen": -1.1675437688827515, "logps/rejected": -1.4906387329101562, "loss": 1.2384, "odds_ratio_loss": 0.5253651142120361, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.058377187699079514, "rewards/margins": 0.016154741868376732, "rewards/rejected": -0.0745319351553917, "sft_loss": 1.1675437688827515, "step": 835 }, { "epoch": 0.672, "grad_norm": 9.640667146514005, "learning_rate": 4.769443696332272e-06, "logits/chosen": -0.2830110788345337, "logits/rejected": -0.8570443391799927, "logps/chosen": -1.2739179134368896, "logps/rejected": -1.4723364114761353, "loss": 1.223, "odds_ratio_loss": 0.7033334374427795, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06369589269161224, "rewards/margins": 0.009920930489897728, "rewards/rejected": -0.07361682504415512, "sft_loss": 1.2739179134368896, "step": 840 }, { "epoch": 0.676, "grad_norm": 12.645968569930028, "learning_rate": 4.764538587416233e-06, "logits/chosen": -0.8710149526596069, "logits/rejected": -0.8611608743667603, "logps/chosen": -1.452150583267212, "logps/rejected": -2.0592668056488037, "loss": 1.3282, "odds_ratio_loss": 0.6635312438011169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07260753214359283, "rewards/margins": 0.03035581484436989, "rewards/rejected": -0.10296335071325302, "sft_loss": 1.452150583267212, "step": 845 }, { "epoch": 0.68, "grad_norm": 9.432527411843669, "learning_rate": 4.759584424871302e-06, "logits/chosen": -0.5329976081848145, "logits/rejected": -0.8772062063217163, "logps/chosen": -0.9151167869567871, "logps/rejected": -1.6539852619171143, "loss": 1.2911, "odds_ratio_loss": 0.4387364387512207, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.045755840837955475, "rewards/margins": 0.03694342449307442, "rewards/rejected": -0.082699254155159, "sft_loss": 0.9151167869567871, "step": 850 }, { "epoch": 0.684, "grad_norm": 7.726661197517004, "learning_rate": 4.754581316012785e-06, "logits/chosen": -0.4633726477622986, "logits/rejected": -0.5861400961875916, "logps/chosen": -0.6932090520858765, "logps/rejected": -1.0552204847335815, "loss": 1.1924, "odds_ratio_loss": 0.4030598998069763, "rewards/accuracies": 1.0, "rewards/chosen": -0.034660451114177704, "rewards/margins": 0.018100565299391747, "rewards/rejected": -0.0527610182762146, "sft_loss": 0.6932090520858765, "step": 855 }, { "epoch": 0.688, "grad_norm": 9.894504014422436, "learning_rate": 4.749529369216246e-06, "logits/chosen": -0.42406773567199707, "logits/rejected": -0.4842946529388428, "logps/chosen": -1.0714144706726074, "logps/rejected": -1.1619442701339722, "loss": 1.352, "odds_ratio_loss": 0.6463958024978638, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05357072502374649, "rewards/margins": 0.0045264954678714275, "rewards/rejected": -0.05809721350669861, "sft_loss": 1.0714144706726074, "step": 860 }, { "epoch": 0.692, "grad_norm": 9.153094977007724, "learning_rate": 4.744428693915158e-06, "logits/chosen": -0.2847048342227936, "logits/rejected": -0.4418734610080719, "logps/chosen": -1.2593896389007568, "logps/rejected": -0.9671368598937988, "loss": 1.3217, "odds_ratio_loss": 0.9432659149169922, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -0.06296949088573456, "rewards/margins": -0.014612642116844654, "rewards/rejected": -0.04835684597492218, "sft_loss": 1.2593896389007568, "step": 865 }, { "epoch": 0.696, "grad_norm": 7.840444386866316, "learning_rate": 4.7392794005985324e-06, "logits/chosen": -0.4391769468784332, "logits/rejected": -0.9956706762313843, "logps/chosen": -1.0819652080535889, "logps/rejected": -1.3912889957427979, "loss": 1.1336, "odds_ratio_loss": 0.5440673232078552, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.054098259657621384, "rewards/margins": 0.01546618901193142, "rewards/rejected": -0.06956445425748825, "sft_loss": 1.0819652080535889, "step": 870 }, { "epoch": 0.7, "grad_norm": 12.411494501286214, "learning_rate": 4.734081600808531e-06, "logits/chosen": -0.9742234349250793, "logits/rejected": -0.4317501485347748, "logps/chosen": -1.1081851720809937, "logps/rejected": -1.167130947113037, "loss": 1.2256, "odds_ratio_loss": 0.6975724697113037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0554092600941658, "rewards/margins": 0.002947291126474738, "rewards/rejected": -0.058356545865535736, "sft_loss": 1.1081851720809937, "step": 875 }, { "epoch": 0.704, "grad_norm": 8.306111344864336, "learning_rate": 4.7288354071380415e-06, "logits/chosen": -0.5642324686050415, "logits/rejected": -0.8995448350906372, "logps/chosen": -1.0954984426498413, "logps/rejected": -1.2880576848983765, "loss": 1.2105, "odds_ratio_loss": 0.6666485071182251, "rewards/accuracies": 0.5, "rewards/chosen": -0.0547749288380146, "rewards/margins": 0.009627962484955788, "rewards/rejected": -0.06440288573503494, "sft_loss": 1.0954984426498413, "step": 880 }, { "epoch": 0.708, "grad_norm": 10.907946303534318, "learning_rate": 4.723540933228245e-06, "logits/chosen": -0.8124354481697083, "logits/rejected": -0.5376805067062378, "logps/chosen": -1.105232834815979, "logps/rejected": -1.4322589635849, "loss": 1.2672, "odds_ratio_loss": 0.5457401871681213, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05526164174079895, "rewards/margins": 0.016351303085684776, "rewards/rejected": -0.07161294668912888, "sft_loss": 1.105232834815979, "step": 885 }, { "epoch": 0.712, "grad_norm": 11.344492337732651, "learning_rate": 4.7181982937661485e-06, "logits/chosen": -0.6914325952529907, "logits/rejected": -0.9896718859672546, "logps/chosen": -1.3581817150115967, "logps/rejected": -1.8004964590072632, "loss": 1.2577, "odds_ratio_loss": 0.5711563229560852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06790908426046371, "rewards/margins": 0.02211574651300907, "rewards/rejected": -0.09002482891082764, "sft_loss": 1.3581817150115967, "step": 890 }, { "epoch": 0.716, "grad_norm": 14.944195714469451, "learning_rate": 4.712807604482108e-06, "logits/chosen": -0.580794632434845, "logits/rejected": -0.6542994379997253, "logps/chosen": -1.1799992322921753, "logps/rejected": -1.1197493076324463, "loss": 1.2151, "odds_ratio_loss": 0.8679493069648743, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.058999963104724884, "rewards/margins": -0.0030124965123832226, "rewards/rejected": -0.05598746985197067, "sft_loss": 1.1799992322921753, "step": 895 }, { "epoch": 0.72, "grad_norm": 6.69431914838341, "learning_rate": 4.707368982147318e-06, "logits/chosen": -0.26036280393600464, "logits/rejected": -0.8896979093551636, "logps/chosen": -1.174317479133606, "logps/rejected": -0.9866326451301575, "loss": 1.2563, "odds_ratio_loss": 1.064286470413208, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.058715879917144775, "rewards/margins": -0.009384247474372387, "rewards/rejected": -0.049331631511449814, "sft_loss": 1.174317479133606, "step": 900 }, { "epoch": 0.724, "grad_norm": 4.7835304167015975, "learning_rate": 4.701882544571277e-06, "logits/chosen": -0.9345399141311646, "logits/rejected": -0.6917155385017395, "logps/chosen": -1.392803430557251, "logps/rejected": -1.371321439743042, "loss": 1.1668, "odds_ratio_loss": 0.7409270405769348, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06964017450809479, "rewards/margins": -0.001074108062312007, "rewards/rejected": -0.06856606900691986, "sft_loss": 1.392803430557251, "step": 905 }, { "epoch": 0.728, "grad_norm": 11.02550077384499, "learning_rate": 4.696348410599244e-06, "logits/chosen": -0.5705204010009766, "logits/rejected": -0.6748573184013367, "logps/chosen": -1.0039920806884766, "logps/rejected": -1.592974066734314, "loss": 1.2679, "odds_ratio_loss": 0.5323291420936584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05019960552453995, "rewards/margins": 0.029449105262756348, "rewards/rejected": -0.0796487107872963, "sft_loss": 1.0039920806884766, "step": 910 }, { "epoch": 0.732, "grad_norm": 10.692588852434419, "learning_rate": 4.690766700109659e-06, "logits/chosen": -0.4835734963417053, "logits/rejected": -0.7319347858428955, "logps/chosen": -1.1647623777389526, "logps/rejected": -2.5557546615600586, "loss": 1.2441, "odds_ratio_loss": 0.41198450326919556, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05823811888694763, "rewards/margins": 0.06954962015151978, "rewards/rejected": -0.1277877390384674, "sft_loss": 1.1647623777389526, "step": 915 }, { "epoch": 0.736, "grad_norm": 10.396971419498014, "learning_rate": 4.685137534011549e-06, "logits/chosen": -0.8939110040664673, "logits/rejected": -0.3123689293861389, "logps/chosen": -1.2236140966415405, "logps/rejected": -1.3663610219955444, "loss": 1.1922, "odds_ratio_loss": 0.7821038365364075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06118069961667061, "rewards/margins": 0.007137349806725979, "rewards/rejected": -0.06831805408000946, "sft_loss": 1.2236140966415405, "step": 920 }, { "epoch": 0.74, "grad_norm": 8.738002331287573, "learning_rate": 4.679461034241906e-06, "logits/chosen": -0.6164524555206299, "logits/rejected": -0.7329466938972473, "logps/chosen": -1.4917128086090088, "logps/rejected": -1.479864478111267, "loss": 1.2453, "odds_ratio_loss": 0.770171046257019, "rewards/accuracies": 0.5, "rewards/chosen": -0.07458565384149551, "rewards/margins": -0.0005924153956584632, "rewards/rejected": -0.07399322092533112, "sft_loss": 1.4917128086090088, "step": 925 }, { "epoch": 0.744, "grad_norm": 23.09910613175578, "learning_rate": 4.673737323763048e-06, "logits/chosen": -0.6520587205886841, "logits/rejected": -0.7833204865455627, "logps/chosen": -1.1136635541915894, "logps/rejected": -1.1649823188781738, "loss": 1.1591, "odds_ratio_loss": 0.724600613117218, "rewards/accuracies": 0.5, "rewards/chosen": -0.05568317696452141, "rewards/margins": 0.0025659373495727777, "rewards/rejected": -0.05824911594390869, "sft_loss": 1.1136635541915894, "step": 930 }, { "epoch": 0.748, "grad_norm": 9.576889593824372, "learning_rate": 4.667966526559953e-06, "logits/chosen": -0.7509557008743286, "logits/rejected": -0.49857956171035767, "logps/chosen": -1.3980618715286255, "logps/rejected": -1.2008607387542725, "loss": 1.3096, "odds_ratio_loss": 0.9694040417671204, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06990310549736023, "rewards/margins": -0.009860062971711159, "rewards/rejected": -0.06004303693771362, "sft_loss": 1.3980618715286255, "step": 935 }, { "epoch": 0.752, "grad_norm": 8.475227307177539, "learning_rate": 4.662148767637578e-06, "logits/chosen": -0.5015154480934143, "logits/rejected": -0.7089800834655762, "logps/chosen": -1.1982038021087646, "logps/rejected": -1.4972496032714844, "loss": 1.2186, "odds_ratio_loss": 0.5610604286193848, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.059910185635089874, "rewards/margins": 0.014952296391129494, "rewards/rejected": -0.07486248016357422, "sft_loss": 1.1982038021087646, "step": 940 }, { "epoch": 0.756, "grad_norm": 10.570484526302579, "learning_rate": 4.656284173018144e-06, "logits/chosen": -0.6629132628440857, "logits/rejected": -0.8142105340957642, "logps/chosen": -1.2457422018051147, "logps/rejected": -1.447788953781128, "loss": 1.2879, "odds_ratio_loss": 0.5766414403915405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0622871108353138, "rewards/margins": 0.010102340951561928, "rewards/rejected": -0.07238946110010147, "sft_loss": 1.2457422018051147, "step": 945 }, { "epoch": 0.76, "grad_norm": 8.792884078555856, "learning_rate": 4.650372869738415e-06, "logits/chosen": -0.38089218735694885, "logits/rejected": -0.724978506565094, "logps/chosen": -1.0249511003494263, "logps/rejected": -1.4063374996185303, "loss": 1.3351, "odds_ratio_loss": 0.5218092799186707, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05124755948781967, "rewards/margins": 0.01906932331621647, "rewards/rejected": -0.07031688839197159, "sft_loss": 1.0249511003494263, "step": 950 }, { "epoch": 0.764, "grad_norm": 9.019548926117617, "learning_rate": 4.644414985846934e-06, "logits/chosen": -0.5280020833015442, "logits/rejected": -1.183532953262329, "logps/chosen": -1.1438463926315308, "logps/rejected": -1.5815012454986572, "loss": 1.311, "odds_ratio_loss": 0.4890708029270172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.057192325592041016, "rewards/margins": 0.02188274636864662, "rewards/rejected": -0.07907506823539734, "sft_loss": 1.1438463926315308, "step": 955 }, { "epoch": 0.768, "grad_norm": 10.010010873228056, "learning_rate": 4.638410650401267e-06, "logits/chosen": -0.6906821131706238, "logits/rejected": -0.42066025733947754, "logps/chosen": -0.9353683590888977, "logps/rejected": -1.1939796209335327, "loss": 1.2082, "odds_ratio_loss": 0.6122977137565613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.046768419444561005, "rewards/margins": 0.012930569238960743, "rewards/rejected": -0.059698980301618576, "sft_loss": 0.9353683590888977, "step": 960 }, { "epoch": 0.772, "grad_norm": 13.636523513835607, "learning_rate": 4.632359993465188e-06, "logits/chosen": -0.7019423842430115, "logits/rejected": -0.8399526476860046, "logps/chosen": -1.0443174839019775, "logps/rejected": -1.1119390726089478, "loss": 1.2281, "odds_ratio_loss": 0.7568255066871643, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.05221586674451828, "rewards/margins": 0.0033810839522629976, "rewards/rejected": -0.055596958845853806, "sft_loss": 1.0443174839019775, "step": 965 }, { "epoch": 0.776, "grad_norm": 10.680468517829564, "learning_rate": 4.626263146105875e-06, "logits/chosen": -0.7569029927253723, "logits/rejected": -1.1442753076553345, "logps/chosen": -1.0383312702178955, "logps/rejected": -1.2682640552520752, "loss": 1.2608, "odds_ratio_loss": 0.715472936630249, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.051916562020778656, "rewards/margins": 0.011496638879179955, "rewards/rejected": -0.06341320276260376, "sft_loss": 1.0383312702178955, "step": 970 }, { "epoch": 0.78, "grad_norm": 16.270504514158986, "learning_rate": 4.620120240391065e-06, "logits/chosen": -0.5434718728065491, "logits/rejected": -0.8585309982299805, "logps/chosen": -1.0939325094223022, "logps/rejected": -1.3599112033843994, "loss": 1.2639, "odds_ratio_loss": 0.7648831605911255, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.05469663068652153, "rewards/margins": 0.01329893060028553, "rewards/rejected": -0.06799556314945221, "sft_loss": 1.0939325094223022, "step": 975 }, { "epoch": 0.784, "grad_norm": 5.804422084174317, "learning_rate": 4.613931409386196e-06, "logits/chosen": -0.7977913022041321, "logits/rejected": -0.9274693727493286, "logps/chosen": -1.18310546875, "logps/rejected": -1.8053483963012695, "loss": 1.2574, "odds_ratio_loss": 0.4495466351509094, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.059155285358428955, "rewards/margins": 0.031112130731344223, "rewards/rejected": -0.09026740491390228, "sft_loss": 1.18310546875, "step": 980 }, { "epoch": 0.788, "grad_norm": 9.788416898738078, "learning_rate": 4.607696787151522e-06, "logits/chosen": -0.9438807368278503, "logits/rejected": -0.6228753328323364, "logps/chosen": -1.5850467681884766, "logps/rejected": -1.510814905166626, "loss": 1.1717, "odds_ratio_loss": 0.8166835904121399, "rewards/accuracies": 0.5, "rewards/chosen": -0.07925233989953995, "rewards/margins": -0.0037115835584700108, "rewards/rejected": -0.07554075121879578, "sft_loss": 1.5850467681884766, "step": 985 }, { "epoch": 0.792, "grad_norm": 12.55818469153721, "learning_rate": 4.601416508739211e-06, "logits/chosen": -0.731402575969696, "logits/rejected": -1.0273559093475342, "logps/chosen": -1.129149317741394, "logps/rejected": -1.219930648803711, "loss": 1.2695, "odds_ratio_loss": 0.6738437414169312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05645746737718582, "rewards/margins": 0.0045390622690320015, "rewards/rejected": -0.06099652498960495, "sft_loss": 1.129149317741394, "step": 990 }, { "epoch": 0.796, "grad_norm": 9.153552767046978, "learning_rate": 4.595090710190419e-06, "logits/chosen": -0.7405273914337158, "logits/rejected": -1.1718242168426514, "logps/chosen": -1.2145724296569824, "logps/rejected": -1.4828448295593262, "loss": 1.1849, "odds_ratio_loss": 0.5643509030342102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06072862073779106, "rewards/margins": 0.013413618318736553, "rewards/rejected": -0.07414223998785019, "sft_loss": 1.2145724296569824, "step": 995 }, { "epoch": 0.8, "grad_norm": 6.94868849947181, "learning_rate": 4.588719528532342e-06, "logits/chosen": -0.7134780883789062, "logits/rejected": -0.905554473400116, "logps/chosen": -1.4175317287445068, "logps/rejected": -1.3552976846694946, "loss": 1.3012, "odds_ratio_loss": 0.8563035130500793, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0708765834569931, "rewards/margins": -0.003111699130386114, "rewards/rejected": -0.06776488572359085, "sft_loss": 1.4175317287445068, "step": 1000 }, { "epoch": 0.804, "grad_norm": 12.921749906218944, "learning_rate": 4.582303101775249e-06, "logits/chosen": -0.7810646891593933, "logits/rejected": -1.1520473957061768, "logps/chosen": -1.2127867937088013, "logps/rejected": -1.4734896421432495, "loss": 1.2652, "odds_ratio_loss": 0.5705010890960693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06063934043049812, "rewards/margins": 0.013035142794251442, "rewards/rejected": -0.07367448508739471, "sft_loss": 1.2127867937088013, "step": 1005 }, { "epoch": 0.808, "grad_norm": 9.197435023909987, "learning_rate": 4.575841568909494e-06, "logits/chosen": -0.6398459672927856, "logits/rejected": -0.5644738078117371, "logps/chosen": -1.268541932106018, "logps/rejected": -1.5668294429779053, "loss": 1.2232, "odds_ratio_loss": 0.6429783701896667, "rewards/accuracies": 0.5, "rewards/chosen": -0.06342709064483643, "rewards/margins": 0.014914381317794323, "rewards/rejected": -0.07834147661924362, "sft_loss": 1.268541932106018, "step": 1010 }, { "epoch": 0.812, "grad_norm": 5.987738310425259, "learning_rate": 4.569335069902502e-06, "logits/chosen": -0.8146150708198547, "logits/rejected": -0.5185267925262451, "logps/chosen": -1.2354018688201904, "logps/rejected": -1.3597724437713623, "loss": 1.2304, "odds_ratio_loss": 0.7136649489402771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06177009269595146, "rewards/margins": 0.006218533962965012, "rewards/rejected": -0.06798862665891647, "sft_loss": 1.2354018688201904, "step": 1015 }, { "epoch": 0.816, "grad_norm": 11.820317098577457, "learning_rate": 4.562783745695738e-06, "logits/chosen": -0.6676985621452332, "logits/rejected": -0.3732604682445526, "logps/chosen": -1.0979827642440796, "logps/rejected": -1.6110618114471436, "loss": 1.2048, "odds_ratio_loss": 0.6128841638565063, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05489913746714592, "rewards/margins": 0.025653952732682228, "rewards/rejected": -0.0805530920624733, "sft_loss": 1.0979827642440796, "step": 1020 }, { "epoch": 0.82, "grad_norm": 12.430320372288909, "learning_rate": 4.556187738201656e-06, "logits/chosen": -0.3898774981498718, "logits/rejected": -1.3098582029342651, "logps/chosen": -0.9205626249313354, "logps/rejected": -1.4675109386444092, "loss": 1.2001, "odds_ratio_loss": 0.6006077527999878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04602813720703125, "rewards/margins": 0.02734740450978279, "rewards/rejected": -0.07337555289268494, "sft_loss": 0.9205626249313354, "step": 1025 }, { "epoch": 0.824, "grad_norm": 8.240787854800642, "learning_rate": 4.549547190300622e-06, "logits/chosen": -0.749883234500885, "logits/rejected": -0.5567277669906616, "logps/chosen": -1.1923110485076904, "logps/rejected": -1.448547124862671, "loss": 1.2761, "odds_ratio_loss": 0.5901089906692505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05961555242538452, "rewards/margins": 0.012811805121600628, "rewards/rejected": -0.07242736220359802, "sft_loss": 1.1923110485076904, "step": 1030 }, { "epoch": 0.828, "grad_norm": 8.576070028214076, "learning_rate": 4.542862245837821e-06, "logits/chosen": -0.4225188195705414, "logits/rejected": -0.694170355796814, "logps/chosen": -1.115942358970642, "logps/rejected": -1.1966840028762817, "loss": 1.3326, "odds_ratio_loss": 0.7018226385116577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.055797118693590164, "rewards/margins": 0.004037079401314259, "rewards/rejected": -0.05983419343829155, "sft_loss": 1.115942358970642, "step": 1035 }, { "epoch": 0.832, "grad_norm": 8.003040390950462, "learning_rate": 4.536133049620143e-06, "logits/chosen": -0.614066481590271, "logits/rejected": -0.9744704365730286, "logps/chosen": -1.1512901782989502, "logps/rejected": -1.686820387840271, "loss": 1.3049, "odds_ratio_loss": 0.5957285761833191, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05756450816988945, "rewards/margins": 0.0267765112221241, "rewards/rejected": -0.08434101194143295, "sft_loss": 1.1512901782989502, "step": 1040 }, { "epoch": 0.836, "grad_norm": 7.214120995777482, "learning_rate": 4.529359747413038e-06, "logits/chosen": -0.602199912071228, "logits/rejected": -1.299626350402832, "logps/chosen": -1.2445770502090454, "logps/rejected": -1.3810675144195557, "loss": 1.3109, "odds_ratio_loss": 0.7491277456283569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.062228865921497345, "rewards/margins": 0.00682451855391264, "rewards/rejected": -0.06905338168144226, "sft_loss": 1.2445770502090454, "step": 1045 }, { "epoch": 0.84, "grad_norm": 7.796605464265155, "learning_rate": 4.522542485937369e-06, "logits/chosen": -0.3931456208229065, "logits/rejected": -0.8093290328979492, "logps/chosen": -1.3564033508300781, "logps/rejected": -1.3207740783691406, "loss": 1.2757, "odds_ratio_loss": 0.8102067708969116, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06782017648220062, "rewards/margins": -0.0017814624588936567, "rewards/rejected": -0.06603871285915375, "sft_loss": 1.3564033508300781, "step": 1050 }, { "epoch": 0.844, "grad_norm": 11.038841006173273, "learning_rate": 4.515681412866228e-06, "logits/chosen": -0.8583968281745911, "logits/rejected": -0.5115640163421631, "logps/chosen": -1.3405869007110596, "logps/rejected": -1.1792690753936768, "loss": 1.2427, "odds_ratio_loss": 1.1266233921051025, "rewards/accuracies": 0.5, "rewards/chosen": -0.06702934950590134, "rewards/margins": -0.008065891452133656, "rewards/rejected": -0.058963458985090256, "sft_loss": 1.3405869007110596, "step": 1055 }, { "epoch": 0.848, "grad_norm": 37.58723049143948, "learning_rate": 4.508776676821739e-06, "logits/chosen": -0.6703466773033142, "logits/rejected": -0.9433542490005493, "logps/chosen": -1.2843058109283447, "logps/rejected": -1.6363897323608398, "loss": 1.359, "odds_ratio_loss": 0.5473671555519104, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0642152950167656, "rewards/margins": 0.017604198306798935, "rewards/rejected": -0.08181948959827423, "sft_loss": 1.2843058109283447, "step": 1060 }, { "epoch": 0.852, "grad_norm": 12.74608944254742, "learning_rate": 4.501828427371834e-06, "logits/chosen": -0.3732682168483734, "logits/rejected": -0.7298794984817505, "logps/chosen": -1.0062689781188965, "logps/rejected": -1.6333110332489014, "loss": 1.225, "odds_ratio_loss": 0.45508384704589844, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.050313450396060944, "rewards/margins": 0.031352099031209946, "rewards/rejected": -0.08166555315256119, "sft_loss": 1.0062689781188965, "step": 1065 }, { "epoch": 0.856, "grad_norm": 4.647181122267829, "learning_rate": 4.494836815027022e-06, "logits/chosen": -0.744674026966095, "logits/rejected": -0.5776953101158142, "logps/chosen": -1.1960818767547607, "logps/rejected": -1.277769684791565, "loss": 1.1808, "odds_ratio_loss": 0.6548304557800293, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05980409309267998, "rewards/margins": 0.0040843975730240345, "rewards/rejected": -0.06388849020004272, "sft_loss": 1.1960818767547607, "step": 1070 }, { "epoch": 0.86, "grad_norm": 9.112232033291143, "learning_rate": 4.48780199123712e-06, "logits/chosen": -0.48859691619873047, "logits/rejected": -0.8226814270019531, "logps/chosen": -0.9840165972709656, "logps/rejected": -1.3010642528533936, "loss": 1.3173, "odds_ratio_loss": 0.5319587588310242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04920082911849022, "rewards/margins": 0.01585238054394722, "rewards/rejected": -0.06505320966243744, "sft_loss": 0.9840165972709656, "step": 1075 }, { "epoch": 0.864, "grad_norm": 8.357848437646894, "learning_rate": 4.4807241083879774e-06, "logits/chosen": -0.7419109344482422, "logits/rejected": -0.5743356347084045, "logps/chosen": -1.0393415689468384, "logps/rejected": -1.860235571861267, "loss": 1.1895, "odds_ratio_loss": 0.3943214416503906, "rewards/accuracies": 1.0, "rewards/chosen": -0.0519670769572258, "rewards/margins": 0.0410446934401989, "rewards/rejected": -0.0930117815732956, "sft_loss": 1.0393415689468384, "step": 1080 }, { "epoch": 0.868, "grad_norm": 7.001917916478807, "learning_rate": 4.473603319798173e-06, "logits/chosen": -0.8395317196846008, "logits/rejected": -0.760552704334259, "logps/chosen": -1.2476990222930908, "logps/rejected": -1.3389023542404175, "loss": 1.2252, "odds_ratio_loss": 0.6428620219230652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0623849555850029, "rewards/margins": 0.004560163244605064, "rewards/rejected": -0.06694512069225311, "sft_loss": 1.2476990222930908, "step": 1085 }, { "epoch": 0.872, "grad_norm": 7.900019571893691, "learning_rate": 4.466439779715696e-06, "logits/chosen": -0.68291836977005, "logits/rejected": -0.886945903301239, "logps/chosen": -1.321554183959961, "logps/rejected": -1.8369743824005127, "loss": 1.2689, "odds_ratio_loss": 0.5644279718399048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06607771664857864, "rewards/margins": 0.02577100321650505, "rewards/rejected": -0.0918487161397934, "sft_loss": 1.321554183959961, "step": 1090 }, { "epoch": 0.876, "grad_norm": 8.642864900662888, "learning_rate": 4.4592336433146e-06, "logits/chosen": -0.633298397064209, "logits/rejected": -1.161464810371399, "logps/chosen": -1.0371453762054443, "logps/rejected": -1.3207931518554688, "loss": 1.1633, "odds_ratio_loss": 0.5788105726242065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.051857270300388336, "rewards/margins": 0.01418238878250122, "rewards/rejected": -0.06603965908288956, "sft_loss": 1.0371453762054443, "step": 1095 }, { "epoch": 0.88, "grad_norm": 9.05386821771176, "learning_rate": 4.451985066691649e-06, "logits/chosen": -0.7766702771186829, "logits/rejected": -0.5547041893005371, "logps/chosen": -1.1104316711425781, "logps/rejected": -1.3220654726028442, "loss": 1.3197, "odds_ratio_loss": 0.6292269229888916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.055521585047245026, "rewards/margins": 0.010581688955426216, "rewards/rejected": -0.06610327214002609, "sft_loss": 1.1104316711425781, "step": 1100 }, { "epoch": 0.884, "grad_norm": 6.120618129770535, "learning_rate": 4.444694206862929e-06, "logits/chosen": -0.7643265128135681, "logits/rejected": -0.858683705329895, "logps/chosen": -1.2096381187438965, "logps/rejected": -1.3979839086532593, "loss": 1.1843, "odds_ratio_loss": 0.7661855220794678, "rewards/accuracies": 0.5, "rewards/chosen": -0.06048189848661423, "rewards/margins": 0.00941728986799717, "rewards/rejected": -0.06989918649196625, "sft_loss": 1.2096381187438965, "step": 1105 }, { "epoch": 0.888, "grad_norm": 11.782139121174437, "learning_rate": 4.437361221760449e-06, "logits/chosen": -0.5270341634750366, "logits/rejected": -0.7325950264930725, "logps/chosen": -1.3420865535736084, "logps/rejected": -1.8302757740020752, "loss": 1.2491, "odds_ratio_loss": 0.6886317133903503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06710433214902878, "rewards/margins": 0.024409450590610504, "rewards/rejected": -0.09151377528905869, "sft_loss": 1.3420865535736084, "step": 1110 }, { "epoch": 0.892, "grad_norm": 18.68798830441005, "learning_rate": 4.4299862702287255e-06, "logits/chosen": -0.5396694540977478, "logits/rejected": -0.6563701033592224, "logps/chosen": -1.2778229713439941, "logps/rejected": -1.229236364364624, "loss": 1.2489, "odds_ratio_loss": 0.869365394115448, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06389115005731583, "rewards/margins": -0.0024293330498039722, "rewards/rejected": -0.06146181747317314, "sft_loss": 1.2778229713439941, "step": 1115 }, { "epoch": 0.896, "grad_norm": 8.493094923954416, "learning_rate": 4.422569512021332e-06, "logits/chosen": -0.43230313062667847, "logits/rejected": -1.0870712995529175, "logps/chosen": -1.1300268173217773, "logps/rejected": -1.6185880899429321, "loss": 1.186, "odds_ratio_loss": 0.4703669548034668, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05650133639574051, "rewards/margins": 0.02442805841565132, "rewards/rejected": -0.08092939108610153, "sft_loss": 1.1300268173217773, "step": 1120 }, { "epoch": 0.9, "grad_norm": 5.267928354352151, "learning_rate": 4.415111107797445e-06, "logits/chosen": -0.5481151342391968, "logits/rejected": -0.9923849105834961, "logps/chosen": -1.230312466621399, "logps/rejected": -1.5415050983428955, "loss": 1.2303, "odds_ratio_loss": 0.5594199299812317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.061515629291534424, "rewards/margins": 0.015559625811874866, "rewards/rejected": -0.07707525044679642, "sft_loss": 1.230312466621399, "step": 1125 }, { "epoch": 0.904, "grad_norm": 8.086520043398188, "learning_rate": 4.407611219118363e-06, "logits/chosen": -0.3550862669944763, "logits/rejected": -0.8939773440361023, "logps/chosen": -1.110655665397644, "logps/rejected": -1.5090737342834473, "loss": 1.2223, "odds_ratio_loss": 0.6366047263145447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0555327832698822, "rewards/margins": 0.01992090605199337, "rewards/rejected": -0.07545368373394012, "sft_loss": 1.110655665397644, "step": 1130 }, { "epoch": 0.908, "grad_norm": 8.346110146833315, "learning_rate": 4.4000700084440046e-06, "logits/chosen": -0.468745619058609, "logits/rejected": -1.0615545511245728, "logps/chosen": -1.219507098197937, "logps/rejected": -1.5035933256149292, "loss": 1.2581, "odds_ratio_loss": 0.9526287317276001, "rewards/accuracies": 0.5, "rewards/chosen": -0.06097535416483879, "rewards/margins": 0.014204311184585094, "rewards/rejected": -0.07517966628074646, "sft_loss": 1.219507098197937, "step": 1135 }, { "epoch": 0.912, "grad_norm": 21.962006553169825, "learning_rate": 4.3924876391293915e-06, "logits/chosen": -0.5484793782234192, "logits/rejected": -0.46796178817749023, "logps/chosen": -1.108927845954895, "logps/rejected": -1.0250526666641235, "loss": 1.2947, "odds_ratio_loss": 0.8142274618148804, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.05544638633728027, "rewards/margins": -0.004193754401057959, "rewards/rejected": -0.05125263333320618, "sft_loss": 1.108927845954895, "step": 1140 }, { "epoch": 0.916, "grad_norm": 15.332345311015494, "learning_rate": 4.384864275421109e-06, "logits/chosen": -0.5269834399223328, "logits/rejected": -0.967495322227478, "logps/chosen": -1.288408875465393, "logps/rejected": -1.6966063976287842, "loss": 1.2474, "odds_ratio_loss": 0.5316591262817383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06442044675350189, "rewards/margins": 0.020409878343343735, "rewards/rejected": -0.08483032882213593, "sft_loss": 1.288408875465393, "step": 1145 }, { "epoch": 0.92, "grad_norm": 10.130919066814135, "learning_rate": 4.377200082453748e-06, "logits/chosen": -0.5598348379135132, "logits/rejected": -0.8047167062759399, "logps/chosen": -1.2670339345932007, "logps/rejected": -1.4398491382598877, "loss": 1.2942, "odds_ratio_loss": 0.6326471567153931, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06335169821977615, "rewards/margins": 0.008640759624540806, "rewards/rejected": -0.07199246436357498, "sft_loss": 1.2670339345932007, "step": 1150 }, { "epoch": 0.924, "grad_norm": 6.4155914839593855, "learning_rate": 4.36949522624633e-06, "logits/chosen": -0.8152793645858765, "logits/rejected": -0.6962675452232361, "logps/chosen": -1.3344051837921143, "logps/rejected": -1.4677374362945557, "loss": 1.2375, "odds_ratio_loss": 0.715882420539856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06672026216983795, "rewards/margins": 0.006666617002338171, "rewards/rejected": -0.07338687032461166, "sft_loss": 1.3344051837921143, "step": 1155 }, { "epoch": 0.928, "grad_norm": 7.363210525745847, "learning_rate": 4.361749873698707e-06, "logits/chosen": -0.5409479737281799, "logits/rejected": -0.722708523273468, "logps/chosen": -1.1098625659942627, "logps/rejected": -1.3639353513717651, "loss": 1.2489, "odds_ratio_loss": 0.7049247026443481, "rewards/accuracies": 0.5, "rewards/chosen": -0.05549313500523567, "rewards/margins": 0.012703632935881615, "rewards/rejected": -0.06819676607847214, "sft_loss": 1.1098625659942627, "step": 1160 }, { "epoch": 0.932, "grad_norm": 10.93689863299814, "learning_rate": 4.353964192587949e-06, "logits/chosen": -0.7145729064941406, "logits/rejected": -0.849127471446991, "logps/chosen": -1.3707650899887085, "logps/rejected": -1.4907456636428833, "loss": 1.2284, "odds_ratio_loss": 0.6671477556228638, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06853825598955154, "rewards/margins": 0.005999024957418442, "rewards/rejected": -0.07453727722167969, "sft_loss": 1.3707650899887085, "step": 1165 }, { "epoch": 0.936, "grad_norm": 6.2833560952473375, "learning_rate": 4.346138351564711e-06, "logits/chosen": -0.5584260821342468, "logits/rejected": -1.0391467809677124, "logps/chosen": -1.1828453540802002, "logps/rejected": -1.3176859617233276, "loss": 1.2247, "odds_ratio_loss": 0.6230691075325012, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05914226919412613, "rewards/margins": 0.006742039229720831, "rewards/rejected": -0.0658842995762825, "sft_loss": 1.1828453540802002, "step": 1170 }, { "epoch": 0.94, "grad_norm": 7.981319588041361, "learning_rate": 4.338272520149572e-06, "logits/chosen": -0.48743829131126404, "logits/rejected": -0.8333004713058472, "logps/chosen": -1.7361185550689697, "logps/rejected": -1.4733507633209229, "loss": 1.2669, "odds_ratio_loss": 0.959330677986145, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08680592477321625, "rewards/margins": -0.013138381764292717, "rewards/rejected": -0.07366754859685898, "sft_loss": 1.7361185550689697, "step": 1175 }, { "epoch": 0.944, "grad_norm": 7.109380757572563, "learning_rate": 4.330366868729376e-06, "logits/chosen": -0.22841350734233856, "logits/rejected": -0.941692054271698, "logps/chosen": -1.0154824256896973, "logps/rejected": -1.5618747472763062, "loss": 1.2833, "odds_ratio_loss": 0.5690563321113586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.050774119794368744, "rewards/margins": 0.02731963060796261, "rewards/rejected": -0.0780937448143959, "sft_loss": 1.0154824256896973, "step": 1180 }, { "epoch": 0.948, "grad_norm": 13.318206186445765, "learning_rate": 4.322421568553529e-06, "logits/chosen": -0.38624149560928345, "logits/rejected": -1.0594637393951416, "logps/chosen": -1.07871413230896, "logps/rejected": -1.562281847000122, "loss": 1.2659, "odds_ratio_loss": 0.5322721600532532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.053935706615448, "rewards/margins": 0.02417839504778385, "rewards/rejected": -0.0781140998005867, "sft_loss": 1.07871413230896, "step": 1185 }, { "epoch": 0.952, "grad_norm": 9.11517027692733, "learning_rate": 4.3144367917302964e-06, "logits/chosen": -0.8500941395759583, "logits/rejected": -0.7091385722160339, "logps/chosen": -1.1739693880081177, "logps/rejected": -1.5754871368408203, "loss": 1.2802, "odds_ratio_loss": 0.6279300451278687, "rewards/accuracies": 0.5, "rewards/chosen": -0.058698467910289764, "rewards/margins": 0.02007589302957058, "rewards/rejected": -0.0787743553519249, "sft_loss": 1.1739693880081177, "step": 1190 }, { "epoch": 0.956, "grad_norm": 7.697301408661275, "learning_rate": 4.30641271122307e-06, "logits/chosen": -0.4521522521972656, "logits/rejected": -0.8235799670219421, "logps/chosen": -1.2677797079086304, "logps/rejected": -1.3189507722854614, "loss": 1.2587, "odds_ratio_loss": 0.691112756729126, "rewards/accuracies": 0.5, "rewards/chosen": -0.06338898837566376, "rewards/margins": 0.002558555454015732, "rewards/rejected": -0.06594754755496979, "sft_loss": 1.2677797079086304, "step": 1195 }, { "epoch": 0.96, "grad_norm": 9.997266919643533, "learning_rate": 4.2983495008466285e-06, "logits/chosen": -0.8174387216567993, "logits/rejected": -0.7150999307632446, "logps/chosen": -0.8913282155990601, "logps/rejected": -1.1885263919830322, "loss": 1.2154, "odds_ratio_loss": 0.6406058073043823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04456641152501106, "rewards/margins": 0.014859904535114765, "rewards/rejected": -0.05942631885409355, "sft_loss": 0.8913282155990601, "step": 1200 }, { "epoch": 0.964, "grad_norm": 5.221880235557841, "learning_rate": 4.290247335263362e-06, "logits/chosen": -1.0014302730560303, "logits/rejected": -0.9239674806594849, "logps/chosen": -1.0359283685684204, "logps/rejected": -1.3768165111541748, "loss": 1.2759, "odds_ratio_loss": 0.6447513103485107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05179642513394356, "rewards/margins": 0.017044400796294212, "rewards/rejected": -0.06884082406759262, "sft_loss": 1.0359283685684204, "step": 1205 }, { "epoch": 0.968, "grad_norm": 6.187170258551322, "learning_rate": 4.2821063899795015e-06, "logits/chosen": -0.39787670969963074, "logits/rejected": -1.0457478761672974, "logps/chosen": -1.1205229759216309, "logps/rejected": -1.4367693662643433, "loss": 1.2251, "odds_ratio_loss": 0.5175265073776245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0560261532664299, "rewards/margins": 0.015812328085303307, "rewards/rejected": -0.07183848321437836, "sft_loss": 1.1205229759216309, "step": 1210 }, { "epoch": 0.972, "grad_norm": 5.076678498483747, "learning_rate": 4.273926841341303e-06, "logits/chosen": -0.9177725911140442, "logits/rejected": -0.9428675770759583, "logps/chosen": -1.2810909748077393, "logps/rejected": -1.7306153774261475, "loss": 1.2798, "odds_ratio_loss": 0.6016807556152344, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06405454128980637, "rewards/margins": 0.02247622422873974, "rewards/rejected": -0.08653075993061066, "sft_loss": 1.2810909748077393, "step": 1215 }, { "epoch": 0.976, "grad_norm": 6.982415041589051, "learning_rate": 4.265708866531238e-06, "logits/chosen": -0.8390816450119019, "logits/rejected": -0.5559083223342896, "logps/chosen": -0.9034653902053833, "logps/rejected": -1.1532642841339111, "loss": 1.2121, "odds_ratio_loss": 0.5824159383773804, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0451732762157917, "rewards/margins": 0.01248995028436184, "rewards/rejected": -0.057663220912218094, "sft_loss": 0.9034653902053833, "step": 1220 }, { "epoch": 0.98, "grad_norm": 8.854119024299864, "learning_rate": 4.257452643564155e-06, "logits/chosen": -0.40446987748146057, "logits/rejected": -0.9087270498275757, "logps/chosen": -0.8603678941726685, "logps/rejected": -1.3758715391159058, "loss": 1.1785, "odds_ratio_loss": 0.416604608297348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0430183932185173, "rewards/margins": 0.025775188580155373, "rewards/rejected": -0.06879357993602753, "sft_loss": 0.8603678941726685, "step": 1225 }, { "epoch": 0.984, "grad_norm": 8.750528387170686, "learning_rate": 4.249158351283414e-06, "logits/chosen": -1.0308736562728882, "logits/rejected": -0.8786516189575195, "logps/chosen": -0.9902039766311646, "logps/rejected": -1.3153424263000488, "loss": 1.2155, "odds_ratio_loss": 0.4901192784309387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04951019585132599, "rewards/margins": 0.016256922855973244, "rewards/rejected": -0.06576712429523468, "sft_loss": 0.9902039766311646, "step": 1230 }, { "epoch": 0.988, "grad_norm": 10.866038696658444, "learning_rate": 4.240826169357024e-06, "logits/chosen": -1.0902023315429688, "logits/rejected": -1.481338620185852, "logps/chosen": -1.419777750968933, "logps/rejected": -1.2114101648330688, "loss": 1.3133, "odds_ratio_loss": 0.9872404932975769, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07098889350891113, "rewards/margins": -0.01041838526725769, "rewards/rejected": -0.06057050824165344, "sft_loss": 1.419777750968933, "step": 1235 }, { "epoch": 0.992, "grad_norm": 7.937224210186921, "learning_rate": 4.232456278273743e-06, "logits/chosen": -0.6265432834625244, "logits/rejected": -0.510567843914032, "logps/chosen": -1.0876511335372925, "logps/rejected": -1.4437576532363892, "loss": 1.1532, "odds_ratio_loss": 0.5487557649612427, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.054382555186748505, "rewards/margins": 0.017805328592658043, "rewards/rejected": -0.0721878856420517, "sft_loss": 1.0876511335372925, "step": 1240 }, { "epoch": 0.996, "grad_norm": 18.05729223024994, "learning_rate": 4.224048859339175e-06, "logits/chosen": -0.4967614710330963, "logits/rejected": -0.7329981327056885, "logps/chosen": -1.0347087383270264, "logps/rejected": -1.6826976537704468, "loss": 1.3198, "odds_ratio_loss": 0.5197392702102661, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.051735442131757736, "rewards/margins": 0.03239942714571953, "rewards/rejected": -0.08413486927747726, "sft_loss": 1.0347087383270264, "step": 1245 }, { "epoch": 1.0, "grad_norm": 8.219444479887624, "learning_rate": 4.215604094671835e-06, "logits/chosen": -0.5567103624343872, "logits/rejected": -0.5455074310302734, "logps/chosen": -1.3113296031951904, "logps/rejected": -1.5509188175201416, "loss": 1.1873, "odds_ratio_loss": 0.6092044711112976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06556646525859833, "rewards/margins": 0.01197946909815073, "rewards/rejected": -0.07754594087600708, "sft_loss": 1.3113296031951904, "step": 1250 }, { "epoch": 1.004, "grad_norm": 6.382865998528307, "learning_rate": 4.207122167199209e-06, "logits/chosen": -0.6577066779136658, "logits/rejected": -0.7173200845718384, "logps/chosen": -0.8776922225952148, "logps/rejected": -1.7570756673812866, "loss": 0.8296, "odds_ratio_loss": 0.33190062642097473, "rewards/accuracies": 1.0, "rewards/chosen": -0.04388461261987686, "rewards/margins": 0.04396916925907135, "rewards/rejected": -0.08785378187894821, "sft_loss": 0.8776922225952148, "step": 1255 }, { "epoch": 1.008, "grad_norm": 5.105708309165987, "learning_rate": 4.198603260653792e-06, "logits/chosen": -0.5839263796806335, "logits/rejected": -0.866306483745575, "logps/chosen": -0.8336095809936523, "logps/rejected": -1.26575767993927, "loss": 0.8358, "odds_ratio_loss": 0.46527212858200073, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.041680481284856796, "rewards/margins": 0.021607402712106705, "rewards/rejected": -0.0632878839969635, "sft_loss": 0.8336095809936523, "step": 1260 }, { "epoch": 1.012, "grad_norm": 4.8302181441829966, "learning_rate": 4.1900475595691044e-06, "logits/chosen": -0.599847674369812, "logits/rejected": -1.0592182874679565, "logps/chosen": -0.5498633980751038, "logps/rejected": -1.3975508213043213, "loss": 0.7752, "odds_ratio_loss": 0.25286543369293213, "rewards/accuracies": 1.0, "rewards/chosen": -0.027493169531226158, "rewards/margins": 0.042384374886751175, "rewards/rejected": -0.06987754255533218, "sft_loss": 0.5498633980751038, "step": 1265 }, { "epoch": 1.016, "grad_norm": 6.056131985323491, "learning_rate": 4.181455249275701e-06, "logits/chosen": -0.2955573499202728, "logits/rejected": -0.5213496088981628, "logps/chosen": -0.7416674494743347, "logps/rejected": -0.9686793088912964, "loss": 0.8138, "odds_ratio_loss": 0.6542133092880249, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.037083376199007034, "rewards/margins": 0.011350592598319054, "rewards/rejected": -0.04843396693468094, "sft_loss": 0.7416674494743347, "step": 1270 }, { "epoch": 1.02, "grad_norm": 5.064697244932875, "learning_rate": 4.172826515897146e-06, "logits/chosen": -0.6576083302497864, "logits/rejected": -0.812027096748352, "logps/chosen": -0.6591814160346985, "logps/rejected": -1.5505030155181885, "loss": 0.7844, "odds_ratio_loss": 0.30150288343429565, "rewards/accuracies": 1.0, "rewards/chosen": -0.03295907378196716, "rewards/margins": 0.0445660725235939, "rewards/rejected": -0.07752513885498047, "sft_loss": 0.6591814160346985, "step": 1275 }, { "epoch": 1.024, "grad_norm": 5.807191452802592, "learning_rate": 4.1641615463459926e-06, "logits/chosen": -0.4745884835720062, "logits/rejected": -0.4976702332496643, "logps/chosen": -0.6141924858093262, "logps/rejected": -1.0756144523620605, "loss": 0.7885, "odds_ratio_loss": 0.458048015832901, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03070961870253086, "rewards/margins": 0.023071100935339928, "rewards/rejected": -0.05378072336316109, "sft_loss": 0.6141924858093262, "step": 1280 }, { "epoch": 1.028, "grad_norm": 5.371739030260733, "learning_rate": 4.1554605283197255e-06, "logits/chosen": -0.8033881187438965, "logits/rejected": -0.8408336639404297, "logps/chosen": -0.478267103433609, "logps/rejected": -1.3151143789291382, "loss": 0.8676, "odds_ratio_loss": 0.30922359228134155, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02391335740685463, "rewards/margins": 0.04184236004948616, "rewards/rejected": -0.06575571000576019, "sft_loss": 0.478267103433609, "step": 1285 }, { "epoch": 1.032, "grad_norm": 6.568180405098848, "learning_rate": 4.146723650296701e-06, "logits/chosen": -0.6499618887901306, "logits/rejected": -0.931594967842102, "logps/chosen": -1.0891497135162354, "logps/rejected": -1.2812997102737427, "loss": 0.9537, "odds_ratio_loss": 0.7874979972839355, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05445748567581177, "rewards/margins": 0.009607489220798016, "rewards/rejected": -0.06406497955322266, "sft_loss": 1.0891497135162354, "step": 1290 }, { "epoch": 1.036, "grad_norm": 8.474892928899138, "learning_rate": 4.1379511015320625e-06, "logits/chosen": -0.4418262839317322, "logits/rejected": -0.8559747934341431, "logps/chosen": -0.9219955205917358, "logps/rejected": -1.7327539920806885, "loss": 0.8688, "odds_ratio_loss": 0.3014797568321228, "rewards/accuracies": 1.0, "rewards/chosen": -0.04609977453947067, "rewards/margins": 0.04053793102502823, "rewards/rejected": -0.0866376981139183, "sft_loss": 0.9219955205917358, "step": 1295 }, { "epoch": 1.04, "grad_norm": 5.782082477060897, "learning_rate": 4.129143072053639e-06, "logits/chosen": -0.7809382677078247, "logits/rejected": -0.8675028085708618, "logps/chosen": -0.8315264582633972, "logps/rejected": -1.3996779918670654, "loss": 0.899, "odds_ratio_loss": 0.3759022355079651, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0415763296186924, "rewards/margins": 0.02840757742524147, "rewards/rejected": -0.06998389959335327, "sft_loss": 0.8315264582633972, "step": 1300 }, { "epoch": 1.044, "grad_norm": 6.243618609939396, "learning_rate": 4.120299752657828e-06, "logits/chosen": -0.7213522791862488, "logits/rejected": -0.5738562345504761, "logps/chosen": -0.927670955657959, "logps/rejected": -1.4523080587387085, "loss": 0.8292, "odds_ratio_loss": 0.45550140738487244, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04638354852795601, "rewards/margins": 0.026231860741972923, "rewards/rejected": -0.07261540740728378, "sft_loss": 0.927670955657959, "step": 1305 }, { "epoch": 1.048, "grad_norm": 9.073750027816232, "learning_rate": 4.111421334905468e-06, "logits/chosen": -0.6499530673027039, "logits/rejected": -0.8254686594009399, "logps/chosen": -0.9249345660209656, "logps/rejected": -1.39388906955719, "loss": 0.7753, "odds_ratio_loss": 0.43550366163253784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0462467297911644, "rewards/margins": 0.023447733372449875, "rewards/rejected": -0.06969445943832397, "sft_loss": 0.9249345660209656, "step": 1310 }, { "epoch": 1.052, "grad_norm": 10.11337598302246, "learning_rate": 4.102508011117684e-06, "logits/chosen": -0.44864311814308167, "logits/rejected": -0.7308858633041382, "logps/chosen": -0.9878190755844116, "logps/rejected": -1.7209956645965576, "loss": 0.7796, "odds_ratio_loss": 0.357673704624176, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04939096421003342, "rewards/margins": 0.03665883094072342, "rewards/rejected": -0.08604978024959564, "sft_loss": 0.9878190755844116, "step": 1315 }, { "epoch": 1.056, "grad_norm": 4.673425946179824, "learning_rate": 4.093559974371725e-06, "logits/chosen": -0.6861502528190613, "logits/rejected": -0.45734572410583496, "logps/chosen": -0.5693352818489075, "logps/rejected": -1.291886568069458, "loss": 0.8203, "odds_ratio_loss": 0.2426222562789917, "rewards/accuracies": 1.0, "rewards/chosen": -0.028466764837503433, "rewards/margins": 0.036127571016550064, "rewards/rejected": -0.0645943284034729, "sft_loss": 0.5693352818489075, "step": 1320 }, { "epoch": 1.06, "grad_norm": 5.847314097304619, "learning_rate": 4.084577418496775e-06, "logits/chosen": -0.8275073766708374, "logits/rejected": -0.6742109060287476, "logps/chosen": -0.5419312119483948, "logps/rejected": -1.4742541313171387, "loss": 0.8365, "odds_ratio_loss": 0.3019058406352997, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02709656022489071, "rewards/margins": 0.046616148203611374, "rewards/rejected": -0.07371271401643753, "sft_loss": 0.5419312119483948, "step": 1325 }, { "epoch": 1.064, "grad_norm": 4.856747702455789, "learning_rate": 4.075560538069767e-06, "logits/chosen": -0.5532726049423218, "logits/rejected": -0.6589607000350952, "logps/chosen": -0.7456861734390259, "logps/rejected": -1.2630527019500732, "loss": 0.8495, "odds_ratio_loss": 0.54130619764328, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03728431463241577, "rewards/margins": 0.025868326425552368, "rewards/rejected": -0.06315263360738754, "sft_loss": 0.7456861734390259, "step": 1330 }, { "epoch": 1.068, "grad_norm": 7.765532059632631, "learning_rate": 4.066509528411151e-06, "logits/chosen": -0.5705880522727966, "logits/rejected": -0.7791796922683716, "logps/chosen": -0.867768406867981, "logps/rejected": -1.508888602256775, "loss": 0.8959, "odds_ratio_loss": 0.4603050649166107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04338841885328293, "rewards/margins": 0.032056018710136414, "rewards/rejected": -0.07544443756341934, "sft_loss": 0.867768406867981, "step": 1335 }, { "epoch": 1.072, "grad_norm": 12.916315689810858, "learning_rate": 4.05742458558068e-06, "logits/chosen": -0.8875905275344849, "logits/rejected": -0.7213624119758606, "logps/chosen": -0.7063192129135132, "logps/rejected": -1.434460997581482, "loss": 0.7654, "odds_ratio_loss": 0.3284406065940857, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03531596064567566, "rewards/margins": 0.03640709072351456, "rewards/rejected": -0.07172305136919022, "sft_loss": 0.7063192129135132, "step": 1340 }, { "epoch": 1.076, "grad_norm": 7.30216329780713, "learning_rate": 4.048305906373151e-06, "logits/chosen": -0.6724140644073486, "logits/rejected": -0.686550498008728, "logps/chosen": -0.9202359914779663, "logps/rejected": -1.229242205619812, "loss": 0.8455, "odds_ratio_loss": 0.5194590091705322, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.046011798083782196, "rewards/margins": 0.015450313687324524, "rewards/rejected": -0.06146211549639702, "sft_loss": 0.9202359914779663, "step": 1345 }, { "epoch": 1.08, "grad_norm": 7.376368865831852, "learning_rate": 4.039153688314146e-06, "logits/chosen": -0.6789649724960327, "logits/rejected": -0.7698289155960083, "logps/chosen": -1.0499086380004883, "logps/rejected": -1.4064363241195679, "loss": 0.8224, "odds_ratio_loss": 0.5207637548446655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05249543860554695, "rewards/margins": 0.017826389521360397, "rewards/rejected": -0.07032182067632675, "sft_loss": 1.0499086380004883, "step": 1350 }, { "epoch": 1.084, "grad_norm": 10.69540570354578, "learning_rate": 4.029968129655757e-06, "logits/chosen": -0.7770857214927673, "logits/rejected": -0.9475702047348022, "logps/chosen": -0.7179058790206909, "logps/rejected": -1.5098683834075928, "loss": 0.8206, "odds_ratio_loss": 0.33577170968055725, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.035895295441150665, "rewards/margins": 0.03959812596440315, "rewards/rejected": -0.07549341022968292, "sft_loss": 0.7179058790206909, "step": 1355 }, { "epoch": 1.088, "grad_norm": 8.204731502525116, "learning_rate": 4.020749429372286e-06, "logits/chosen": -0.851569652557373, "logits/rejected": -0.98048335313797, "logps/chosen": -0.7297436594963074, "logps/rejected": -1.4111974239349365, "loss": 0.8462, "odds_ratio_loss": 0.3803647458553314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.036487188190221786, "rewards/margins": 0.03407268971204758, "rewards/rejected": -0.07055987417697906, "sft_loss": 0.7297436594963074, "step": 1360 }, { "epoch": 1.092, "grad_norm": 8.235008694291277, "learning_rate": 4.011497787155938e-06, "logits/chosen": -0.7461971044540405, "logits/rejected": -1.4180670976638794, "logps/chosen": -0.9519845843315125, "logps/rejected": -1.9593369960784912, "loss": 0.9038, "odds_ratio_loss": 0.356905996799469, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04759923368692398, "rewards/margins": 0.050367631018161774, "rewards/rejected": -0.09796686470508575, "sft_loss": 0.9519845843315125, "step": 1365 }, { "epoch": 1.096, "grad_norm": 15.389585033573624, "learning_rate": 4.002213403412492e-06, "logits/chosen": -0.6717543005943298, "logits/rejected": -0.9089921116828918, "logps/chosen": -1.0161564350128174, "logps/rejected": -1.3220138549804688, "loss": 0.8947, "odds_ratio_loss": 0.5496761202812195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05080781504511833, "rewards/margins": 0.01529287826269865, "rewards/rejected": -0.06610070168972015, "sft_loss": 1.0161564350128174, "step": 1370 }, { "epoch": 1.1, "grad_norm": 7.445613929034652, "learning_rate": 3.992896479256966e-06, "logits/chosen": -0.680860161781311, "logits/rejected": -0.6606763601303101, "logps/chosen": -0.6550648808479309, "logps/rejected": -1.352782964706421, "loss": 0.8223, "odds_ratio_loss": 0.3205656111240387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.032753244042396545, "rewards/margins": 0.03488590195775032, "rewards/rejected": -0.06763914227485657, "sft_loss": 0.6550648808479309, "step": 1375 }, { "epoch": 1.104, "grad_norm": 5.348676916256749, "learning_rate": 3.983547216509254e-06, "logits/chosen": -0.5789454579353333, "logits/rejected": -1.2544965744018555, "logps/chosen": -0.796095609664917, "logps/rejected": -1.5642998218536377, "loss": 0.7846, "odds_ratio_loss": 0.32175213098526, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03980477899312973, "rewards/margins": 0.038410209119319916, "rewards/rejected": -0.07821498811244965, "sft_loss": 0.796095609664917, "step": 1380 }, { "epoch": 1.108, "grad_norm": 11.505326821901532, "learning_rate": 3.974165817689758e-06, "logits/chosen": -0.956534206867218, "logits/rejected": -1.1092783212661743, "logps/chosen": -1.0736840963363647, "logps/rejected": -1.5605031251907349, "loss": 0.8767, "odds_ratio_loss": 0.49855270981788635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05368421599268913, "rewards/margins": 0.02434094250202179, "rewards/rejected": -0.07802516222000122, "sft_loss": 1.0736840963363647, "step": 1385 }, { "epoch": 1.112, "grad_norm": 7.11996683938393, "learning_rate": 3.964752486015001e-06, "logits/chosen": -0.702655553817749, "logits/rejected": -1.0277684926986694, "logps/chosen": -0.8141889572143555, "logps/rejected": -1.9155117273330688, "loss": 0.7498, "odds_ratio_loss": 0.26831483840942383, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04070945456624031, "rewards/margins": 0.05506613105535507, "rewards/rejected": -0.09577558934688568, "sft_loss": 0.8141889572143555, "step": 1390 }, { "epoch": 1.116, "grad_norm": 5.797590260170669, "learning_rate": 3.955307425393224e-06, "logits/chosen": -0.7000407576560974, "logits/rejected": -0.9680187106132507, "logps/chosen": -0.9160090684890747, "logps/rejected": -1.364323616027832, "loss": 0.8904, "odds_ratio_loss": 0.45110782980918884, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.045800451189279556, "rewards/margins": 0.022415729239583015, "rewards/rejected": -0.06821618974208832, "sft_loss": 0.9160090684890747, "step": 1395 }, { "epoch": 1.12, "grad_norm": 8.433254983665345, "learning_rate": 3.945830840419966e-06, "logits/chosen": -0.7009550929069519, "logits/rejected": -1.0051246881484985, "logps/chosen": -0.4989989697933197, "logps/rejected": -1.407712697982788, "loss": 0.7925, "odds_ratio_loss": 0.27720946073532104, "rewards/accuracies": 1.0, "rewards/chosen": -0.024949947372078896, "rewards/margins": 0.045435696840286255, "rewards/rejected": -0.07038564234972, "sft_loss": 0.4989989697933197, "step": 1400 }, { "epoch": 1.124, "grad_norm": 12.99123162983583, "learning_rate": 3.936322936373641e-06, "logits/chosen": -0.6947557330131531, "logits/rejected": -0.905806839466095, "logps/chosen": -0.9586040377616882, "logps/rejected": -1.2324825525283813, "loss": 0.8352, "odds_ratio_loss": 0.5849367380142212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04793020337820053, "rewards/margins": 0.013693928718566895, "rewards/rejected": -0.06162412837147713, "sft_loss": 0.9586040377616882, "step": 1405 }, { "epoch": 1.1280000000000001, "grad_norm": 8.170518488538953, "learning_rate": 3.92678391921108e-06, "logits/chosen": -1.1551212072372437, "logits/rejected": -0.8484439849853516, "logps/chosen": -0.5776058435440063, "logps/rejected": -1.4134206771850586, "loss": 0.8291, "odds_ratio_loss": 0.31984660029411316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.028880292549729347, "rewards/margins": 0.04179074615240097, "rewards/rejected": -0.07067102938890457, "sft_loss": 0.5776058435440063, "step": 1410 }, { "epoch": 1.1320000000000001, "grad_norm": 8.09496861727856, "learning_rate": 3.9172139955630774e-06, "logits/chosen": -0.713034987449646, "logits/rejected": -0.9983466863632202, "logps/chosen": -0.6458183526992798, "logps/rejected": -1.828302025794983, "loss": 0.866, "odds_ratio_loss": 0.30388593673706055, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03229091316461563, "rewards/margins": 0.0591241791844368, "rewards/rejected": -0.09141509234905243, "sft_loss": 0.6458183526992798, "step": 1415 }, { "epoch": 1.1360000000000001, "grad_norm": 16.205423769664655, "learning_rate": 3.907613372729916e-06, "logits/chosen": -0.6726616024971008, "logits/rejected": -1.3900980949401855, "logps/chosen": -0.7454610466957092, "logps/rejected": -2.1414051055908203, "loss": 0.8471, "odds_ratio_loss": 0.2859545350074768, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03727305680513382, "rewards/margins": 0.06979719549417496, "rewards/rejected": -0.10707025229930878, "sft_loss": 0.7454610466957092, "step": 1420 }, { "epoch": 1.1400000000000001, "grad_norm": 10.269281839827107, "learning_rate": 3.897982258676867e-06, "logits/chosen": -0.7214896082878113, "logits/rejected": -0.8336132168769836, "logps/chosen": -0.9054323434829712, "logps/rejected": -2.1648507118225098, "loss": 0.9165, "odds_ratio_loss": 0.38728705048561096, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0452716164290905, "rewards/margins": 0.06297094374895096, "rewards/rejected": -0.10824254900217056, "sft_loss": 0.9054323434829712, "step": 1425 }, { "epoch": 1.144, "grad_norm": 6.605575368297014, "learning_rate": 3.888320862029699e-06, "logits/chosen": -0.5836726427078247, "logits/rejected": -0.7385787963867188, "logps/chosen": -0.9469176530838013, "logps/rejected": -1.4403495788574219, "loss": 0.8601, "odds_ratio_loss": 0.4318571984767914, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04734588414430618, "rewards/margins": 0.02467160113155842, "rewards/rejected": -0.07201748341321945, "sft_loss": 0.9469176530838013, "step": 1430 }, { "epoch": 1.148, "grad_norm": 5.846266947718309, "learning_rate": 3.878629392070143e-06, "logits/chosen": -0.7644879817962646, "logits/rejected": -0.9813248515129089, "logps/chosen": -0.8727389574050903, "logps/rejected": -1.5859434604644775, "loss": 0.8655, "odds_ratio_loss": 0.47635746002197266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.043636951595544815, "rewards/margins": 0.03566022962331772, "rewards/rejected": -0.07929717004299164, "sft_loss": 0.8727389574050903, "step": 1435 }, { "epoch": 1.152, "grad_norm": 8.026595177727547, "learning_rate": 3.868908058731376e-06, "logits/chosen": -1.0093854665756226, "logits/rejected": -0.8972814679145813, "logps/chosen": -0.641372799873352, "logps/rejected": -1.255726933479309, "loss": 0.7964, "odds_ratio_loss": 0.33428263664245605, "rewards/accuracies": 1.0, "rewards/chosen": -0.0320686399936676, "rewards/margins": 0.030717704445123672, "rewards/rejected": -0.06278634071350098, "sft_loss": 0.641372799873352, "step": 1440 }, { "epoch": 1.156, "grad_norm": 7.131166323627883, "learning_rate": 3.859157072593459e-06, "logits/chosen": -0.8095647096633911, "logits/rejected": -1.309104323387146, "logps/chosen": -0.9154605865478516, "logps/rejected": -1.6045910120010376, "loss": 0.8756, "odds_ratio_loss": 0.3739989399909973, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04577303305268288, "rewards/margins": 0.0344565287232399, "rewards/rejected": -0.08022955805063248, "sft_loss": 0.9154605865478516, "step": 1445 }, { "epoch": 1.16, "grad_norm": 11.191454450304596, "learning_rate": 3.849376644878783e-06, "logits/chosen": -0.7892749905586243, "logits/rejected": -0.7882648706436157, "logps/chosen": -1.122315764427185, "logps/rejected": -1.2966439723968506, "loss": 0.8137, "odds_ratio_loss": 0.6968892812728882, "rewards/accuracies": 0.5, "rewards/chosen": -0.05611578747630119, "rewards/margins": 0.008716410025954247, "rewards/rejected": -0.06483219563961029, "sft_loss": 1.122315764427185, "step": 1450 }, { "epoch": 1.164, "grad_norm": 9.51974427968147, "learning_rate": 3.839566987447492e-06, "logits/chosen": -1.2010838985443115, "logits/rejected": -0.9590059518814087, "logps/chosen": -0.6959027051925659, "logps/rejected": -1.873453140258789, "loss": 0.831, "odds_ratio_loss": 0.36710458993911743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0347951278090477, "rewards/margins": 0.05887751653790474, "rewards/rejected": -0.09367264807224274, "sft_loss": 0.6959027051925659, "step": 1455 }, { "epoch": 1.168, "grad_norm": 6.6227065501638815, "learning_rate": 3.829728312792895e-06, "logits/chosen": -0.8780848383903503, "logits/rejected": -0.8581530451774597, "logps/chosen": -0.9142974019050598, "logps/rejected": -1.3824703693389893, "loss": 0.78, "odds_ratio_loss": 0.4631693363189697, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04571487009525299, "rewards/margins": 0.023408645763993263, "rewards/rejected": -0.0691235214471817, "sft_loss": 0.9142974019050598, "step": 1460 }, { "epoch": 1.172, "grad_norm": 6.363999703267043, "learning_rate": 3.819860834036859e-06, "logits/chosen": -0.5977569222450256, "logits/rejected": -0.7959061861038208, "logps/chosen": -1.1498180627822876, "logps/rejected": -1.42813241481781, "loss": 0.8887, "odds_ratio_loss": 0.5259816646575928, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05749090388417244, "rewards/margins": 0.013915717601776123, "rewards/rejected": -0.07140661776065826, "sft_loss": 1.1498180627822876, "step": 1465 }, { "epoch": 1.176, "grad_norm": 6.246391614781854, "learning_rate": 3.8099647649251984e-06, "logits/chosen": -0.7605501413345337, "logits/rejected": -1.1908546686172485, "logps/chosen": -0.8100579380989075, "logps/rejected": -1.775541067123413, "loss": 0.8348, "odds_ratio_loss": 0.42932015657424927, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04050289839506149, "rewards/margins": 0.04827415570616722, "rewards/rejected": -0.08877705037593842, "sft_loss": 0.8100579380989075, "step": 1470 }, { "epoch": 1.18, "grad_norm": 5.893619858051333, "learning_rate": 3.8000403198230385e-06, "logits/chosen": -0.787771463394165, "logits/rejected": -1.0664321184158325, "logps/chosen": -0.8280608057975769, "logps/rejected": -1.8852030038833618, "loss": 0.822, "odds_ratio_loss": 0.3562834858894348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.041403044015169144, "rewards/margins": 0.05285710096359253, "rewards/rejected": -0.09426014125347137, "sft_loss": 0.8280608057975769, "step": 1475 }, { "epoch": 1.184, "grad_norm": 7.183406495752281, "learning_rate": 3.790087713710179e-06, "logits/chosen": -0.7213200330734253, "logits/rejected": -0.6918378472328186, "logps/chosen": -0.5293585658073425, "logps/rejected": -1.8362514972686768, "loss": 0.7973, "odds_ratio_loss": 0.19014953076839447, "rewards/accuracies": 1.0, "rewards/chosen": -0.026467930525541306, "rewards/margins": 0.06534464657306671, "rewards/rejected": -0.09181257337331772, "sft_loss": 0.5293585658073425, "step": 1480 }, { "epoch": 1.188, "grad_norm": 17.119549825566487, "learning_rate": 3.780107162176429e-06, "logits/chosen": -0.6820231676101685, "logits/rejected": -1.1885521411895752, "logps/chosen": -0.7856122255325317, "logps/rejected": -1.2408835887908936, "loss": 0.8931, "odds_ratio_loss": 0.4235461354255676, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03928060457110405, "rewards/margins": 0.022763576358556747, "rewards/rejected": -0.0620441809296608, "sft_loss": 0.7856122255325317, "step": 1485 }, { "epoch": 1.192, "grad_norm": 6.561999759540379, "learning_rate": 3.770098881416945e-06, "logits/chosen": -0.8763113021850586, "logits/rejected": -1.155576229095459, "logps/chosen": -0.5533097386360168, "logps/rejected": -1.3931071758270264, "loss": 0.8147, "odds_ratio_loss": 0.2342720478773117, "rewards/accuracies": 1.0, "rewards/chosen": -0.02766549028456211, "rewards/margins": 0.041989874094724655, "rewards/rejected": -0.06965536624193192, "sft_loss": 0.5533097386360168, "step": 1490 }, { "epoch": 1.196, "grad_norm": 5.852667265636353, "learning_rate": 3.760063088227542e-06, "logits/chosen": -0.7204443216323853, "logits/rejected": -1.0452661514282227, "logps/chosen": -0.6752656698226929, "logps/rejected": -1.6966698169708252, "loss": 0.842, "odds_ratio_loss": 0.3053438067436218, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.033763282001018524, "rewards/margins": 0.0510701946914196, "rewards/rejected": -0.08483348786830902, "sft_loss": 0.6752656698226929, "step": 1495 }, { "epoch": 1.2, "grad_norm": 7.666952370293206, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -1.2530326843261719, "logits/rejected": -1.17472243309021, "logps/chosen": -0.8737776875495911, "logps/rejected": -1.5337140560150146, "loss": 0.8295, "odds_ratio_loss": 0.4370272159576416, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.043688882142305374, "rewards/margins": 0.032996825873851776, "rewards/rejected": -0.07668570429086685, "sft_loss": 0.8737776875495911, "step": 1500 }, { "epoch": 1.204, "grad_norm": 8.643591741745004, "learning_rate": 3.739909834717356e-06, "logits/chosen": -1.0214110612869263, "logits/rejected": -1.1113232374191284, "logps/chosen": -1.0257785320281982, "logps/rejected": -1.4637627601623535, "loss": 0.84, "odds_ratio_loss": 0.4033835530281067, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.05128893256187439, "rewards/margins": 0.021899200975894928, "rewards/rejected": -0.07318813353776932, "sft_loss": 1.0257785320281982, "step": 1505 }, { "epoch": 1.208, "grad_norm": 6.581431751480548, "learning_rate": 3.7297928109491765e-06, "logits/chosen": -0.735799252986908, "logits/rejected": -0.750560462474823, "logps/chosen": -0.838580310344696, "logps/rejected": -1.3117971420288086, "loss": 0.8373, "odds_ratio_loss": 0.4745975434780121, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04192901775240898, "rewards/margins": 0.023660842329263687, "rewards/rejected": -0.06558986008167267, "sft_loss": 0.838580310344696, "step": 1510 }, { "epoch": 1.212, "grad_norm": 8.38248048628384, "learning_rate": 3.7196491478468322e-06, "logits/chosen": -0.7478394508361816, "logits/rejected": -0.9995372891426086, "logps/chosen": -0.8661327362060547, "logps/rejected": -1.3012148141860962, "loss": 0.7344, "odds_ratio_loss": 0.4485481381416321, "rewards/accuracies": 1.0, "rewards/chosen": -0.043306637555360794, "rewards/margins": 0.021754100918769836, "rewards/rejected": -0.06506074219942093, "sft_loss": 0.8661327362060547, "step": 1515 }, { "epoch": 1.216, "grad_norm": 7.766215821807629, "learning_rate": 3.7094790651387414e-06, "logits/chosen": -0.7365530729293823, "logits/rejected": -0.8527762293815613, "logps/chosen": -0.9115310907363892, "logps/rejected": -1.5248000621795654, "loss": 0.8285, "odds_ratio_loss": 0.4467516839504242, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0455765500664711, "rewards/margins": 0.030663449317216873, "rewards/rejected": -0.07623999565839767, "sft_loss": 0.9115310907363892, "step": 1520 }, { "epoch": 1.22, "grad_norm": 6.4955437270373135, "learning_rate": 3.699282783125616e-06, "logits/chosen": -0.7742820978164673, "logits/rejected": -0.937674880027771, "logps/chosen": -0.8425240516662598, "logps/rejected": -1.2824028730392456, "loss": 0.779, "odds_ratio_loss": 0.4461577534675598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04212620481848717, "rewards/margins": 0.021993935108184814, "rewards/rejected": -0.06412014365196228, "sft_loss": 0.8425240516662598, "step": 1525 }, { "epoch": 1.224, "grad_norm": 6.362003415761398, "learning_rate": 3.689060522675689e-06, "logits/chosen": -0.7660902142524719, "logits/rejected": -1.5068974494934082, "logps/chosen": -0.6618943810462952, "logps/rejected": -1.425063133239746, "loss": 0.845, "odds_ratio_loss": 0.4632844030857086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03309471905231476, "rewards/margins": 0.038158439099788666, "rewards/rejected": -0.07125315815210342, "sft_loss": 0.6618943810462952, "step": 1530 }, { "epoch": 1.228, "grad_norm": 6.895766693871366, "learning_rate": 3.6788125052199264e-06, "logits/chosen": -0.7660204172134399, "logits/rejected": -1.0502755641937256, "logps/chosen": -0.8439900279045105, "logps/rejected": -1.3339447975158691, "loss": 0.8453, "odds_ratio_loss": 0.40678563714027405, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.042199499905109406, "rewards/margins": 0.024497732520103455, "rewards/rejected": -0.06669723987579346, "sft_loss": 0.8439900279045105, "step": 1535 }, { "epoch": 1.232, "grad_norm": 7.095832854877261, "learning_rate": 3.668538952747236e-06, "logits/chosen": -0.8988308906555176, "logits/rejected": -1.370678186416626, "logps/chosen": -0.7767564058303833, "logps/rejected": -0.9345728158950806, "loss": 0.8361, "odds_ratio_loss": 0.6314736008644104, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03883781656622887, "rewards/margins": 0.007890825159847736, "rewards/rejected": -0.04672864452004433, "sft_loss": 0.7767564058303833, "step": 1540 }, { "epoch": 1.236, "grad_norm": 15.297702452409231, "learning_rate": 3.658240087799655e-06, "logits/chosen": -1.128368854522705, "logits/rejected": -0.9741285443305969, "logps/chosen": -0.9071061015129089, "logps/rejected": -1.4348928928375244, "loss": 0.8299, "odds_ratio_loss": 0.4918842315673828, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04535530507564545, "rewards/margins": 0.026389339938759804, "rewards/rejected": -0.0717446431517601, "sft_loss": 0.9071061015129089, "step": 1545 }, { "epoch": 1.24, "grad_norm": 15.626108640705025, "learning_rate": 3.6479161334675294e-06, "logits/chosen": -0.7529286742210388, "logits/rejected": -0.9413396716117859, "logps/chosen": -0.6087412238121033, "logps/rejected": -1.2803432941436768, "loss": 0.7919, "odds_ratio_loss": 0.30838295817375183, "rewards/accuracies": 1.0, "rewards/chosen": -0.030437063425779343, "rewards/margins": 0.033580102026462555, "rewards/rejected": -0.0640171617269516, "sft_loss": 0.6087412238121033, "step": 1550 }, { "epoch": 1.244, "grad_norm": 7.081745221757999, "learning_rate": 3.6375673133846847e-06, "logits/chosen": -0.8344209790229797, "logits/rejected": -0.9608109593391418, "logps/chosen": -1.0645172595977783, "logps/rejected": -1.4052292108535767, "loss": 0.8832, "odds_ratio_loss": 0.5416877269744873, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05322585627436638, "rewards/margins": 0.017035599797964096, "rewards/rejected": -0.07026146352291107, "sft_loss": 1.0645172595977783, "step": 1555 }, { "epoch": 1.248, "grad_norm": 7.328229540786072, "learning_rate": 3.627193851723577e-06, "logits/chosen": -0.8219780921936035, "logits/rejected": -1.0412352085113525, "logps/chosen": -0.8243353962898254, "logps/rejected": -1.1440980434417725, "loss": 0.7661, "odds_ratio_loss": 0.48546719551086426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04121676832437515, "rewards/margins": 0.01598813384771347, "rewards/rejected": -0.05720490217208862, "sft_loss": 0.8243353962898254, "step": 1560 }, { "epoch": 1.252, "grad_norm": 8.481385406459447, "learning_rate": 3.616795973190442e-06, "logits/chosen": -0.6780751943588257, "logits/rejected": -1.5924546718597412, "logps/chosen": -0.7332156896591187, "logps/rejected": -1.6106353998184204, "loss": 0.8628, "odds_ratio_loss": 0.2695372700691223, "rewards/accuracies": 1.0, "rewards/chosen": -0.03666078671813011, "rewards/margins": 0.04387098178267479, "rewards/rejected": -0.0805317685008049, "sft_loss": 0.7332156896591187, "step": 1565 }, { "epoch": 1.256, "grad_norm": 4.8716559657268625, "learning_rate": 3.6063739030204226e-06, "logits/chosen": -0.7873969078063965, "logits/rejected": -1.142225980758667, "logps/chosen": -0.7441651821136475, "logps/rejected": -1.4604860544204712, "loss": 0.7917, "odds_ratio_loss": 0.44381627440452576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03720825910568237, "rewards/margins": 0.03581603989005089, "rewards/rejected": -0.07302430272102356, "sft_loss": 0.7441651821136475, "step": 1570 }, { "epoch": 1.26, "grad_norm": 5.809355093226559, "learning_rate": 3.595927866972694e-06, "logits/chosen": -0.8129979968070984, "logits/rejected": -0.8149601817131042, "logps/chosen": -0.7339099645614624, "logps/rejected": -1.4441230297088623, "loss": 0.8216, "odds_ratio_loss": 0.3208310604095459, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03669549524784088, "rewards/margins": 0.035510655492544174, "rewards/rejected": -0.07220615446567535, "sft_loss": 0.7339099645614624, "step": 1575 }, { "epoch": 1.264, "grad_norm": 6.200919368514555, "learning_rate": 3.5854580913255706e-06, "logits/chosen": -0.5802955627441406, "logits/rejected": -0.772422194480896, "logps/chosen": -1.0532017946243286, "logps/rejected": -1.506858468055725, "loss": 0.8307, "odds_ratio_loss": 0.4608602523803711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05266008898615837, "rewards/margins": 0.022682830691337585, "rewards/rejected": -0.07534292340278625, "sft_loss": 1.0532017946243286, "step": 1580 }, { "epoch": 1.268, "grad_norm": 5.8218405882663085, "learning_rate": 3.574964802871607e-06, "logits/chosen": -0.5340670347213745, "logits/rejected": -1.2797355651855469, "logps/chosen": -0.8620258569717407, "logps/rejected": -1.4560054540634155, "loss": 0.8596, "odds_ratio_loss": 0.3901941776275635, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.043101292103528976, "rewards/margins": 0.029698973521590233, "rewards/rejected": -0.07280026376247406, "sft_loss": 0.8620258569717407, "step": 1585 }, { "epoch": 1.272, "grad_norm": 6.664692745829687, "learning_rate": 3.564448228912682e-06, "logits/chosen": -0.5848423838615417, "logits/rejected": -0.9934035539627075, "logps/chosen": -0.7198655009269714, "logps/rejected": -1.534895896911621, "loss": 0.6926, "odds_ratio_loss": 0.38178950548171997, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03599327430129051, "rewards/margins": 0.04075152799487114, "rewards/rejected": -0.07674480974674225, "sft_loss": 0.7198655009269714, "step": 1590 }, { "epoch": 1.276, "grad_norm": 6.854701270720263, "learning_rate": 3.5539085972550786e-06, "logits/chosen": -0.5101417303085327, "logits/rejected": -1.0358482599258423, "logps/chosen": -0.7137448191642761, "logps/rejected": -1.2877639532089233, "loss": 0.7468, "odds_ratio_loss": 0.388786256313324, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.035687245428562164, "rewards/margins": 0.02870095707476139, "rewards/rejected": -0.0643882006406784, "sft_loss": 0.7137448191642761, "step": 1595 }, { "epoch": 1.28, "grad_norm": 6.779174368552459, "learning_rate": 3.543346136204545e-06, "logits/chosen": -0.9765304327011108, "logits/rejected": -0.6466418504714966, "logps/chosen": -0.9283822178840637, "logps/rejected": -1.2504222393035889, "loss": 0.8717, "odds_ratio_loss": 0.5800614953041077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04641910642385483, "rewards/margins": 0.016102004796266556, "rewards/rejected": -0.06252111494541168, "sft_loss": 0.9283822178840637, "step": 1600 }, { "epoch": 1.284, "grad_norm": 9.786348784846165, "learning_rate": 3.532761074561355e-06, "logits/chosen": -0.6300225853919983, "logits/rejected": -1.1493359804153442, "logps/chosen": -1.0870106220245361, "logps/rejected": -1.61102294921875, "loss": 0.801, "odds_ratio_loss": 0.4377533793449402, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.054350532591342926, "rewards/margins": 0.026200611144304276, "rewards/rejected": -0.0805511474609375, "sft_loss": 1.0870106220245361, "step": 1605 }, { "epoch": 1.288, "grad_norm": 5.913885782177144, "learning_rate": 3.522153641615345e-06, "logits/chosen": -0.5452396273612976, "logits/rejected": -1.095402479171753, "logps/chosen": -0.9149934649467468, "logps/rejected": -1.1826355457305908, "loss": 0.9447, "odds_ratio_loss": 0.5239506959915161, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04574967175722122, "rewards/margins": 0.01338210143148899, "rewards/rejected": -0.05913177877664566, "sft_loss": 0.9149934649467468, "step": 1610 }, { "epoch": 1.292, "grad_norm": 5.896924307562007, "learning_rate": 3.5115240671409534e-06, "logits/chosen": -0.6529924273490906, "logits/rejected": -0.7851907014846802, "logps/chosen": -0.8748595118522644, "logps/rejected": -1.2012172937393188, "loss": 0.8603, "odds_ratio_loss": 0.5565796494483948, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04374297708272934, "rewards/margins": 0.016317885369062424, "rewards/rejected": -0.060060858726501465, "sft_loss": 0.8748595118522644, "step": 1615 }, { "epoch": 1.296, "grad_norm": 9.095017819397759, "learning_rate": 3.5008725813922383e-06, "logits/chosen": -0.5774226188659668, "logits/rejected": -1.0028631687164307, "logps/chosen": -1.284346342086792, "logps/rejected": -1.4374910593032837, "loss": 0.8847, "odds_ratio_loss": 0.6303257346153259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06421732157468796, "rewards/margins": 0.007657224778085947, "rewards/rejected": -0.07187455147504807, "sft_loss": 1.284346342086792, "step": 1620 }, { "epoch": 1.3, "grad_norm": 9.33528794237972, "learning_rate": 3.4901994150978926e-06, "logits/chosen": -0.878362774848938, "logits/rejected": -0.7265375256538391, "logps/chosen": -0.8903997540473938, "logps/rejected": -1.3938519954681396, "loss": 0.8503, "odds_ratio_loss": 0.45942550897598267, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04451999068260193, "rewards/margins": 0.025172609835863113, "rewards/rejected": -0.06969259679317474, "sft_loss": 0.8903997540473938, "step": 1625 }, { "epoch": 1.304, "grad_norm": 10.158405753265898, "learning_rate": 3.4795047994562463e-06, "logits/chosen": -0.8939259648323059, "logits/rejected": -0.750242292881012, "logps/chosen": -0.9898282885551453, "logps/rejected": -1.685683250427246, "loss": 0.8473, "odds_ratio_loss": 0.5104387402534485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.049491409212350845, "rewards/margins": 0.03479275852441788, "rewards/rejected": -0.08428415656089783, "sft_loss": 0.9898282885551453, "step": 1630 }, { "epoch": 1.308, "grad_norm": 5.954588970837507, "learning_rate": 3.4687889661302577e-06, "logits/chosen": -0.9046002626419067, "logits/rejected": -0.9883207082748413, "logps/chosen": -0.8091312646865845, "logps/rejected": -1.3389207124710083, "loss": 0.8038, "odds_ratio_loss": 0.5021311640739441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.040456559509038925, "rewards/margins": 0.02648947760462761, "rewards/rejected": -0.06694603711366653, "sft_loss": 0.8091312646865845, "step": 1635 }, { "epoch": 1.312, "grad_norm": 6.811003884561586, "learning_rate": 3.458052147242494e-06, "logits/chosen": -0.7402358651161194, "logits/rejected": -0.9504525065422058, "logps/chosen": -0.9438158273696899, "logps/rejected": -2.0763096809387207, "loss": 0.8385, "odds_ratio_loss": 0.28224462270736694, "rewards/accuracies": 1.0, "rewards/chosen": -0.047190792858600616, "rewards/margins": 0.05662469193339348, "rewards/rejected": -0.1038154810667038, "sft_loss": 0.9438158273696899, "step": 1640 }, { "epoch": 1.316, "grad_norm": 8.65594926906311, "learning_rate": 3.4472945753701038e-06, "logits/chosen": -0.7355834245681763, "logits/rejected": -0.7831618189811707, "logps/chosen": -0.8326643705368042, "logps/rejected": -1.457876205444336, "loss": 0.8597, "odds_ratio_loss": 0.43533915281295776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04163322225213051, "rewards/margins": 0.031260598450899124, "rewards/rejected": -0.07289381325244904, "sft_loss": 0.8326643705368042, "step": 1645 }, { "epoch": 1.32, "grad_norm": 5.787660518955911, "learning_rate": 3.436516483539781e-06, "logits/chosen": -0.6674588918685913, "logits/rejected": -1.381403923034668, "logps/chosen": -0.6492362022399902, "logps/rejected": -1.1699539422988892, "loss": 0.7273, "odds_ratio_loss": 0.35149624943733215, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03246181458234787, "rewards/margins": 0.026035884395241737, "rewards/rejected": -0.05849768966436386, "sft_loss": 0.6492362022399902, "step": 1650 }, { "epoch": 1.324, "grad_norm": 6.447720144022753, "learning_rate": 3.4257181052227133e-06, "logits/chosen": -0.76915043592453, "logits/rejected": -1.1846320629119873, "logps/chosen": -0.8949073553085327, "logps/rejected": -1.5921183824539185, "loss": 0.8674, "odds_ratio_loss": 0.47409844398498535, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04474537447094917, "rewards/margins": 0.03486054390668869, "rewards/rejected": -0.07960591465234756, "sft_loss": 0.8949073553085327, "step": 1655 }, { "epoch": 1.328, "grad_norm": 16.496650507435916, "learning_rate": 3.4148996743295305e-06, "logits/chosen": -0.8497360348701477, "logits/rejected": -1.0253467559814453, "logps/chosen": -0.7608711123466492, "logps/rejected": -1.4417269229888916, "loss": 0.7469, "odds_ratio_loss": 0.33891811966896057, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.038043562322854996, "rewards/margins": 0.034042783081531525, "rewards/rejected": -0.07208634912967682, "sft_loss": 0.7608711123466492, "step": 1660 }, { "epoch": 1.332, "grad_norm": 12.21871272581458, "learning_rate": 3.4040614252052305e-06, "logits/chosen": -0.7546567916870117, "logits/rejected": -1.1986757516860962, "logps/chosen": -0.6745157241821289, "logps/rejected": -1.5598911046981812, "loss": 0.7454, "odds_ratio_loss": 0.3794470429420471, "rewards/accuracies": 1.0, "rewards/chosen": -0.03372578322887421, "rewards/margins": 0.04426876828074455, "rewards/rejected": -0.07799455523490906, "sft_loss": 0.6745157241821289, "step": 1665 }, { "epoch": 1.336, "grad_norm": 5.398708828190264, "learning_rate": 3.3932035926241103e-06, "logits/chosen": -0.9119020700454712, "logits/rejected": -0.9367231130599976, "logps/chosen": -0.8278735280036926, "logps/rejected": -1.6131706237792969, "loss": 0.7558, "odds_ratio_loss": 0.4295481741428375, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04139367491006851, "rewards/margins": 0.03926485404372215, "rewards/rejected": -0.08065854012966156, "sft_loss": 0.8278735280036926, "step": 1670 }, { "epoch": 1.34, "grad_norm": 7.998094668850599, "learning_rate": 3.3823264117846722e-06, "logits/chosen": -0.7095149159431458, "logits/rejected": -0.9372469186782837, "logps/chosen": -0.6504486799240112, "logps/rejected": -1.4709304571151733, "loss": 0.7999, "odds_ratio_loss": 0.3226034343242645, "rewards/accuracies": 1.0, "rewards/chosen": -0.03252243250608444, "rewards/margins": 0.041024088859558105, "rewards/rejected": -0.07354652881622314, "sft_loss": 0.6504486799240112, "step": 1675 }, { "epoch": 1.3439999999999999, "grad_norm": 13.402324415016235, "learning_rate": 3.3714301183045382e-06, "logits/chosen": -1.0852770805358887, "logits/rejected": -1.0294393301010132, "logps/chosen": -0.6565207242965698, "logps/rejected": -2.4157521724700928, "loss": 0.835, "odds_ratio_loss": 0.23401963710784912, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03282603248953819, "rewards/margins": 0.0879615768790245, "rewards/rejected": -0.1207876056432724, "sft_loss": 0.6565207242965698, "step": 1680 }, { "epoch": 1.3479999999999999, "grad_norm": 6.178693388099064, "learning_rate": 3.360514948215339e-06, "logits/chosen": -0.9066254496574402, "logits/rejected": -1.067453384399414, "logps/chosen": -0.5945879817008972, "logps/rejected": -1.1769797801971436, "loss": 0.7498, "odds_ratio_loss": 0.4167722761631012, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.02972939983010292, "rewards/margins": 0.029119592159986496, "rewards/rejected": -0.05884898826479912, "sft_loss": 0.5945879817008972, "step": 1685 }, { "epoch": 1.3519999999999999, "grad_norm": 8.662276656949278, "learning_rate": 3.349581137957604e-06, "logits/chosen": -0.7470632791519165, "logits/rejected": -1.1178592443466187, "logps/chosen": -0.7460489869117737, "logps/rejected": -1.3396974802017212, "loss": 0.7988, "odds_ratio_loss": 0.39526912569999695, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.037302445620298386, "rewards/margins": 0.029682422056794167, "rewards/rejected": -0.0669848695397377, "sft_loss": 0.7460489869117737, "step": 1690 }, { "epoch": 1.3559999999999999, "grad_norm": 5.759001631003996, "learning_rate": 3.338628924375638e-06, "logits/chosen": -0.5028613805770874, "logits/rejected": -0.9039660692214966, "logps/chosen": -0.7187774181365967, "logps/rejected": -1.8051927089691162, "loss": 0.7071, "odds_ratio_loss": 0.32172414660453796, "rewards/accuracies": 1.0, "rewards/chosen": -0.03593887761235237, "rewards/margins": 0.05432076379656792, "rewards/rejected": -0.09025964140892029, "sft_loss": 0.7187774181365967, "step": 1695 }, { "epoch": 1.3599999999999999, "grad_norm": 10.588167123809765, "learning_rate": 3.3276585447123957e-06, "logits/chosen": -1.002184510231018, "logits/rejected": -1.065515160560608, "logps/chosen": -0.9078339338302612, "logps/rejected": -1.463555097579956, "loss": 0.829, "odds_ratio_loss": 0.39932265877723694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04539170488715172, "rewards/margins": 0.02778605744242668, "rewards/rejected": -0.0731777623295784, "sft_loss": 0.9078339338302612, "step": 1700 }, { "epoch": 1.3639999999999999, "grad_norm": 6.065572924023139, "learning_rate": 3.3166702366043364e-06, "logits/chosen": -0.9326748847961426, "logits/rejected": -1.246891736984253, "logps/chosen": -0.949301540851593, "logps/rejected": -1.7571815252304077, "loss": 0.8751, "odds_ratio_loss": 0.5060352087020874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04746507853269577, "rewards/margins": 0.040393996983766556, "rewards/rejected": -0.08785907924175262, "sft_loss": 0.949301540851593, "step": 1705 }, { "epoch": 1.3679999999999999, "grad_norm": 7.242528374663146, "learning_rate": 3.3056642380762783e-06, "logits/chosen": -0.7751462459564209, "logits/rejected": -0.972857654094696, "logps/chosen": -1.051584243774414, "logps/rejected": -1.275604486465454, "loss": 0.8721, "odds_ratio_loss": 0.57765793800354, "rewards/accuracies": 0.5, "rewards/chosen": -0.05257921293377876, "rewards/margins": 0.011201009154319763, "rewards/rejected": -0.06378022581338882, "sft_loss": 1.051584243774414, "step": 1710 }, { "epoch": 1.3719999999999999, "grad_norm": 5.790819655822902, "learning_rate": 3.294640787536245e-06, "logits/chosen": -0.8153125643730164, "logits/rejected": -1.0614861249923706, "logps/chosen": -0.6195241212844849, "logps/rejected": -1.4311720132827759, "loss": 0.8206, "odds_ratio_loss": 0.3184904158115387, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03097620978951454, "rewards/margins": 0.040582384914159775, "rewards/rejected": -0.07155859470367432, "sft_loss": 0.6195241212844849, "step": 1715 }, { "epoch": 1.376, "grad_norm": 11.980247948452748, "learning_rate": 3.2836001237702993e-06, "logits/chosen": -1.001387596130371, "logits/rejected": -1.2363550662994385, "logps/chosen": -0.6902490854263306, "logps/rejected": -1.1932241916656494, "loss": 0.8108, "odds_ratio_loss": 0.368974506855011, "rewards/accuracies": 1.0, "rewards/chosen": -0.034512460231781006, "rewards/margins": 0.025148753076791763, "rewards/rejected": -0.05966120958328247, "sft_loss": 0.6902490854263306, "step": 1720 }, { "epoch": 1.38, "grad_norm": 5.0828283091407735, "learning_rate": 3.272542485937369e-06, "logits/chosen": -1.0046998262405396, "logits/rejected": -0.8059650659561157, "logps/chosen": -0.7130595445632935, "logps/rejected": -1.1542508602142334, "loss": 0.8238, "odds_ratio_loss": 0.4597712457180023, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03565298020839691, "rewards/margins": 0.02205955982208252, "rewards/rejected": -0.05771253630518913, "sft_loss": 0.7130595445632935, "step": 1725 }, { "epoch": 1.384, "grad_norm": 5.35699654066337, "learning_rate": 3.2614681135640696e-06, "logits/chosen": -0.863267719745636, "logits/rejected": -1.239708423614502, "logps/chosen": -0.877366840839386, "logps/rejected": -1.5349080562591553, "loss": 0.805, "odds_ratio_loss": 0.3669111132621765, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04386834427714348, "rewards/margins": 0.03287705034017563, "rewards/rejected": -0.07674539089202881, "sft_loss": 0.877366840839386, "step": 1730 }, { "epoch": 1.388, "grad_norm": 7.300604070779147, "learning_rate": 3.2503772465395143e-06, "logits/chosen": -0.8804348111152649, "logits/rejected": -0.6921446323394775, "logps/chosen": -0.6613166332244873, "logps/rejected": -1.4078587293624878, "loss": 0.7645, "odds_ratio_loss": 0.3147312104701996, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.033065833151340485, "rewards/margins": 0.037327107042074203, "rewards/rejected": -0.07039294391870499, "sft_loss": 0.6613166332244873, "step": 1735 }, { "epoch": 1.392, "grad_norm": 7.544399455719484, "learning_rate": 3.2392701251101172e-06, "logits/chosen": -0.749683678150177, "logits/rejected": -1.0036853551864624, "logps/chosen": -0.8362616300582886, "logps/rejected": -1.4140605926513672, "loss": 0.8505, "odds_ratio_loss": 0.3677152693271637, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04181307926774025, "rewards/margins": 0.028889944776892662, "rewards/rejected": -0.07070302218198776, "sft_loss": 0.8362616300582886, "step": 1740 }, { "epoch": 1.396, "grad_norm": 5.220293895574105, "learning_rate": 3.228146989874389e-06, "logits/chosen": -0.8896152377128601, "logits/rejected": -0.9276788830757141, "logps/chosen": -0.9979864358901978, "logps/rejected": -1.3124749660491943, "loss": 0.9121, "odds_ratio_loss": 0.5718216896057129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.049899328500032425, "rewards/margins": 0.015724416822195053, "rewards/rejected": -0.06562374532222748, "sft_loss": 0.9979864358901978, "step": 1745 }, { "epoch": 1.4, "grad_norm": 8.96935863952734, "learning_rate": 3.217008081777726e-06, "logits/chosen": -0.8349775075912476, "logits/rejected": -0.8859192132949829, "logps/chosen": -0.7498109340667725, "logps/rejected": -1.4023478031158447, "loss": 0.7615, "odds_ratio_loss": 0.3978427052497864, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03749055042862892, "rewards/margins": 0.03262684494256973, "rewards/rejected": -0.07011739164590836, "sft_loss": 0.7498109340667725, "step": 1750 }, { "epoch": 1.404, "grad_norm": 4.812654034935553, "learning_rate": 3.205853642107192e-06, "logits/chosen": -0.8215142488479614, "logits/rejected": -0.8328277468681335, "logps/chosen": -0.7706052660942078, "logps/rejected": -1.4792053699493408, "loss": 0.7118, "odds_ratio_loss": 0.3464731276035309, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.038530267775058746, "rewards/margins": 0.03542999550700188, "rewards/rejected": -0.07396025955677032, "sft_loss": 0.7706052660942078, "step": 1755 }, { "epoch": 1.408, "grad_norm": 5.327583776041735, "learning_rate": 3.1946839124862873e-06, "logits/chosen": -0.8214722871780396, "logits/rejected": -0.9568880796432495, "logps/chosen": -0.8749944567680359, "logps/rejected": -1.4335380792617798, "loss": 0.8792, "odds_ratio_loss": 0.47741952538490295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04374972730875015, "rewards/margins": 0.027927175164222717, "rewards/rejected": -0.07167690247297287, "sft_loss": 0.8749944567680359, "step": 1760 }, { "epoch": 1.412, "grad_norm": 7.208526674502584, "learning_rate": 3.183499134869721e-06, "logits/chosen": -0.8680820465087891, "logits/rejected": -0.9875283241271973, "logps/chosen": -1.09853196144104, "logps/rejected": -1.329644799232483, "loss": 0.8253, "odds_ratio_loss": 0.6291381120681763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05492659658193588, "rewards/margins": 0.01155565120279789, "rewards/rejected": -0.06648223847150803, "sft_loss": 1.09853196144104, "step": 1765 }, { "epoch": 1.416, "grad_norm": 12.79259892670254, "learning_rate": 3.1722995515381644e-06, "logits/chosen": -1.3741729259490967, "logits/rejected": -0.9874537587165833, "logps/chosen": -0.6389180421829224, "logps/rejected": -1.135964035987854, "loss": 0.8255, "odds_ratio_loss": 0.34384432435035706, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.031945906579494476, "rewards/margins": 0.024852296337485313, "rewards/rejected": -0.05679820105433464, "sft_loss": 0.6389180421829224, "step": 1770 }, { "epoch": 1.42, "grad_norm": 8.588985311767232, "learning_rate": 3.1610854050930063e-06, "logits/chosen": -0.8712812662124634, "logits/rejected": -1.1689540147781372, "logps/chosen": -1.0062519311904907, "logps/rejected": -1.973914384841919, "loss": 0.9165, "odds_ratio_loss": 0.35945457220077515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.050312597304582596, "rewards/margins": 0.04838312417268753, "rewards/rejected": -0.09869572520256042, "sft_loss": 1.0062519311904907, "step": 1775 }, { "epoch": 1.424, "grad_norm": 5.412987589877953, "learning_rate": 3.149856938451094e-06, "logits/chosen": -0.5333024263381958, "logits/rejected": -1.3137054443359375, "logps/chosen": -0.8601778149604797, "logps/rejected": -1.641958475112915, "loss": 0.888, "odds_ratio_loss": 0.7197853922843933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.043008893728256226, "rewards/margins": 0.039089031517505646, "rewards/rejected": -0.08209791779518127, "sft_loss": 0.8601778149604797, "step": 1780 }, { "epoch": 1.428, "grad_norm": 6.165182680586835, "learning_rate": 3.1386143948394764e-06, "logits/chosen": -1.0411813259124756, "logits/rejected": -0.48357734084129333, "logps/chosen": -0.7215684652328491, "logps/rejected": -1.2277052402496338, "loss": 0.774, "odds_ratio_loss": 0.4655603766441345, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03607841581106186, "rewards/margins": 0.02530684694647789, "rewards/rejected": -0.06138526648283005, "sft_loss": 0.7215684652328491, "step": 1785 }, { "epoch": 1.432, "grad_norm": 5.856667562992677, "learning_rate": 3.127358017790132e-06, "logits/chosen": -1.1004786491394043, "logits/rejected": -0.8851817846298218, "logps/chosen": -0.5895828008651733, "logps/rejected": -1.2631773948669434, "loss": 0.7728, "odds_ratio_loss": 0.329679012298584, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.029479142278432846, "rewards/margins": 0.03367973119020462, "rewards/rejected": -0.06315887719392776, "sft_loss": 0.5895828008651733, "step": 1790 }, { "epoch": 1.436, "grad_norm": 5.146001537629413, "learning_rate": 3.116088051134695e-06, "logits/chosen": -1.0093369483947754, "logits/rejected": -1.3544859886169434, "logps/chosen": -0.7406253218650818, "logps/rejected": -1.5723696947097778, "loss": 0.8389, "odds_ratio_loss": 0.31857210397720337, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03703127056360245, "rewards/margins": 0.041587214916944504, "rewards/rejected": -0.07861848175525665, "sft_loss": 0.7406253218650818, "step": 1795 }, { "epoch": 1.44, "grad_norm": 4.843927894063411, "learning_rate": 3.1048047389991693e-06, "logits/chosen": -0.903844952583313, "logits/rejected": -0.8139872550964355, "logps/chosen": -1.0251444578170776, "logps/rejected": -1.2178661823272705, "loss": 0.9374, "odds_ratio_loss": 0.6262407302856445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05125723034143448, "rewards/margins": 0.00963608454912901, "rewards/rejected": -0.060893308371305466, "sft_loss": 1.0251444578170776, "step": 1800 }, { "epoch": 1.444, "grad_norm": 7.431201496879773, "learning_rate": 3.0935083257986493e-06, "logits/chosen": -0.6094223260879517, "logits/rejected": -0.899307370185852, "logps/chosen": -0.7733746767044067, "logps/rejected": -1.2325997352600098, "loss": 0.8234, "odds_ratio_loss": 0.44773632287979126, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03866872936487198, "rewards/margins": 0.022961260750889778, "rewards/rejected": -0.06162998825311661, "sft_loss": 0.7733746767044067, "step": 1805 }, { "epoch": 1.448, "grad_norm": 11.586614332572152, "learning_rate": 3.082199056232015e-06, "logits/chosen": -0.7563143372535706, "logits/rejected": -1.069698452949524, "logps/chosen": -1.023844599723816, "logps/rejected": -1.5895153284072876, "loss": 0.7528, "odds_ratio_loss": 0.4808468222618103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.051192231476306915, "rewards/margins": 0.028283540159463882, "rewards/rejected": -0.0794757753610611, "sft_loss": 1.023844599723816, "step": 1810 }, { "epoch": 1.452, "grad_norm": 8.9164649839478, "learning_rate": 3.0708771752766397e-06, "logits/chosen": -1.1078320741653442, "logits/rejected": -1.0441960096359253, "logps/chosen": -0.8418495059013367, "logps/rejected": -1.273721694946289, "loss": 0.8566, "odds_ratio_loss": 0.4950701594352722, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04209247976541519, "rewards/margins": 0.0215936116874218, "rewards/rejected": -0.06368608772754669, "sft_loss": 0.8418495059013367, "step": 1815 }, { "epoch": 1.456, "grad_norm": 5.634450444011399, "learning_rate": 3.059542928183079e-06, "logits/chosen": -1.020975112915039, "logits/rejected": -1.0432868003845215, "logps/chosen": -0.8400118947029114, "logps/rejected": -1.9318656921386719, "loss": 0.8575, "odds_ratio_loss": 0.45713549852371216, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04200059548020363, "rewards/margins": 0.05459269881248474, "rewards/rejected": -0.09659329801797867, "sft_loss": 0.8400118947029114, "step": 1820 }, { "epoch": 1.46, "grad_norm": 10.1775941550842, "learning_rate": 3.0481965604697582e-06, "logits/chosen": -0.6371676325798035, "logits/rejected": -1.1217305660247803, "logps/chosen": -0.7022415995597839, "logps/rejected": -1.5772149562835693, "loss": 0.7258, "odds_ratio_loss": 0.3305204510688782, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.035112082958221436, "rewards/margins": 0.04374866932630539, "rewards/rejected": -0.07886074483394623, "sft_loss": 0.7022415995597839, "step": 1825 }, { "epoch": 1.464, "grad_norm": 6.898491609991041, "learning_rate": 3.0368383179176584e-06, "logits/chosen": -0.5923871994018555, "logits/rejected": -1.568169355392456, "logps/chosen": -0.7653383016586304, "logps/rejected": -1.7148897647857666, "loss": 0.7246, "odds_ratio_loss": 0.36337292194366455, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03826691955327988, "rewards/margins": 0.04747757315635681, "rewards/rejected": -0.08574448525905609, "sft_loss": 0.7653383016586304, "step": 1830 }, { "epoch": 1.468, "grad_norm": 5.964977560470005, "learning_rate": 3.025468446564985e-06, "logits/chosen": -1.165166974067688, "logits/rejected": -0.8749963641166687, "logps/chosen": -0.8071148991584778, "logps/rejected": -1.657152533531189, "loss": 0.865, "odds_ratio_loss": 0.321114182472229, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04035574942827225, "rewards/margins": 0.04250188171863556, "rewards/rejected": -0.08285762369632721, "sft_loss": 0.8071148991584778, "step": 1835 }, { "epoch": 1.472, "grad_norm": 9.069724398190369, "learning_rate": 3.0140871927018466e-06, "logits/chosen": -1.0001251697540283, "logits/rejected": -0.8858006596565247, "logps/chosen": -0.8963924646377563, "logps/rejected": -1.671795129776001, "loss": 0.9639, "odds_ratio_loss": 0.41972383856773376, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04481962323188782, "rewards/margins": 0.03877013176679611, "rewards/rejected": -0.08358974754810333, "sft_loss": 0.8963924646377563, "step": 1840 }, { "epoch": 1.476, "grad_norm": 8.923998975352527, "learning_rate": 3.002694802864912e-06, "logits/chosen": -0.8372132182121277, "logits/rejected": -0.897018551826477, "logps/chosen": -0.6927198171615601, "logps/rejected": -1.5746911764144897, "loss": 0.7381, "odds_ratio_loss": 0.30051150918006897, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.034635990858078, "rewards/margins": 0.044098567217588425, "rewards/rejected": -0.07873455435037613, "sft_loss": 0.6927198171615601, "step": 1845 }, { "epoch": 1.48, "grad_norm": 11.770053955264393, "learning_rate": 2.9912915238320755e-06, "logits/chosen": -0.6851986646652222, "logits/rejected": -1.1784414052963257, "logps/chosen": -0.833079993724823, "logps/rejected": -1.4270697832107544, "loss": 0.8667, "odds_ratio_loss": 0.42909732460975647, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04165399819612503, "rewards/margins": 0.02969948947429657, "rewards/rejected": -0.0713534951210022, "sft_loss": 0.833079993724823, "step": 1850 }, { "epoch": 1.484, "grad_norm": 6.233141067464782, "learning_rate": 2.9798776026171087e-06, "logits/chosen": -0.842139720916748, "logits/rejected": -1.1131788492202759, "logps/chosen": -0.9360324740409851, "logps/rejected": -1.4683505296707153, "loss": 0.8339, "odds_ratio_loss": 0.5007175207138062, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04680163040757179, "rewards/margins": 0.026615899056196213, "rewards/rejected": -0.07341752201318741, "sft_loss": 0.9360324740409851, "step": 1855 }, { "epoch": 1.488, "grad_norm": 6.003031745272665, "learning_rate": 2.9684532864643123e-06, "logits/chosen": -0.8800287246704102, "logits/rejected": -1.1693894863128662, "logps/chosen": -0.8431293368339539, "logps/rejected": -1.1915092468261719, "loss": 0.8388, "odds_ratio_loss": 0.5465279817581177, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04215647280216217, "rewards/margins": 0.017418991774320602, "rewards/rejected": -0.059575460851192474, "sft_loss": 0.8431293368339539, "step": 1860 }, { "epoch": 1.492, "grad_norm": 4.835110290839885, "learning_rate": 2.957018822843154e-06, "logits/chosen": -0.8184400796890259, "logits/rejected": -1.3122928142547607, "logps/chosen": -0.6195321679115295, "logps/rejected": -1.447097897529602, "loss": 0.8414, "odds_ratio_loss": 0.3124271035194397, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.030976608395576477, "rewards/margins": 0.04137829318642616, "rewards/rejected": -0.07235489785671234, "sft_loss": 0.6195321679115295, "step": 1865 }, { "epoch": 1.496, "grad_norm": 6.054406343408761, "learning_rate": 2.945574459442917e-06, "logits/chosen": -0.9596401453018188, "logits/rejected": -1.1060923337936401, "logps/chosen": -0.6389337778091431, "logps/rejected": -1.4341099262237549, "loss": 0.8757, "odds_ratio_loss": 0.32751792669296265, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03194668889045715, "rewards/margins": 0.03975880891084671, "rewards/rejected": -0.07170549780130386, "sft_loss": 0.6389337778091431, "step": 1870 }, { "epoch": 1.5, "grad_norm": 5.445524941655018, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -0.647433876991272, "logits/rejected": -1.0591230392456055, "logps/chosen": -0.6938437223434448, "logps/rejected": -1.3513716459274292, "loss": 0.7662, "odds_ratio_loss": 0.5081362128257751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0346921868622303, "rewards/margins": 0.0328763946890831, "rewards/rejected": -0.0675685852766037, "sft_loss": 0.6938437223434448, "step": 1875 }, { "epoch": 1.504, "grad_norm": 5.6546570414563, "learning_rate": 2.922657025129185e-06, "logits/chosen": -0.6220548748970032, "logits/rejected": -0.8203363418579102, "logps/chosen": -0.8300703167915344, "logps/rejected": -1.3827760219573975, "loss": 0.7737, "odds_ratio_loss": 0.40472111105918884, "rewards/accuracies": 1.0, "rewards/chosen": -0.04150351509451866, "rewards/margins": 0.02763528749346733, "rewards/rejected": -0.06913881003856659, "sft_loss": 0.8300703167915344, "step": 1880 }, { "epoch": 1.508, "grad_norm": 6.230171438098299, "learning_rate": 2.9111844506449973e-06, "logits/chosen": -0.7947267293930054, "logits/rejected": -0.9040530920028687, "logps/chosen": -0.9311960935592651, "logps/rejected": -1.4013652801513672, "loss": 0.7731, "odds_ratio_loss": 0.44902342557907104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04655980318784714, "rewards/margins": 0.02350846491754055, "rewards/rejected": -0.07006827741861343, "sft_loss": 0.9311960935592651, "step": 1885 }, { "epoch": 1.512, "grad_norm": 6.7332266058939405, "learning_rate": 2.8997029692295875e-06, "logits/chosen": -0.814238429069519, "logits/rejected": -0.9660609364509583, "logps/chosen": -0.7938546538352966, "logps/rejected": -1.457929253578186, "loss": 0.9477, "odds_ratio_loss": 0.4195871949195862, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03969273716211319, "rewards/margins": 0.033203721046447754, "rewards/rejected": -0.07289645820856094, "sft_loss": 0.7938546538352966, "step": 1890 }, { "epoch": 1.516, "grad_norm": 5.752938476673707, "learning_rate": 2.888212829590719e-06, "logits/chosen": -0.6543909311294556, "logits/rejected": -0.5943178534507751, "logps/chosen": -0.5880377888679504, "logps/rejected": -1.3045456409454346, "loss": 0.8214, "odds_ratio_loss": 0.3315754234790802, "rewards/accuracies": 1.0, "rewards/chosen": -0.02940189279615879, "rewards/margins": 0.03582539036870003, "rewards/rejected": -0.06522727757692337, "sft_loss": 0.5880377888679504, "step": 1895 }, { "epoch": 1.52, "grad_norm": 4.713391058679103, "learning_rate": 2.876714280623708e-06, "logits/chosen": -0.8652218580245972, "logits/rejected": -0.9359322786331177, "logps/chosen": -0.6846807599067688, "logps/rejected": -1.5125911235809326, "loss": 0.7733, "odds_ratio_loss": 0.2784159481525421, "rewards/accuracies": 1.0, "rewards/chosen": -0.03423403576016426, "rewards/margins": 0.04139552637934685, "rewards/rejected": -0.07562955468893051, "sft_loss": 0.6846807599067688, "step": 1900 }, { "epoch": 1.524, "grad_norm": 9.06130869139701, "learning_rate": 2.8652075714060296e-06, "logits/chosen": -0.920418381690979, "logits/rejected": -0.9699466824531555, "logps/chosen": -0.49238044023513794, "logps/rejected": -1.3032811880111694, "loss": 0.7794, "odds_ratio_loss": 0.21321973204612732, "rewards/accuracies": 1.0, "rewards/chosen": -0.024619024246931076, "rewards/margins": 0.040545038878917694, "rewards/rejected": -0.06516405940055847, "sft_loss": 0.49238044023513794, "step": 1905 }, { "epoch": 1.528, "grad_norm": 6.44994215725063, "learning_rate": 2.8536929511919227e-06, "logits/chosen": -0.5823496580123901, "logits/rejected": -1.490532636642456, "logps/chosen": -0.7626686096191406, "logps/rejected": -1.4420548677444458, "loss": 0.8321, "odds_ratio_loss": 0.3727583587169647, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03813342750072479, "rewards/margins": 0.03396930545568466, "rewards/rejected": -0.07210274040699005, "sft_loss": 0.7626686096191406, "step": 1910 }, { "epoch": 1.532, "grad_norm": 10.576382626683463, "learning_rate": 2.842170669406993e-06, "logits/chosen": -0.9067608714103699, "logits/rejected": -1.0328947305679321, "logps/chosen": -0.7888859510421753, "logps/rejected": -1.5981676578521729, "loss": 0.9028, "odds_ratio_loss": 0.3959670960903168, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.039444297552108765, "rewards/margins": 0.04046408459544182, "rewards/rejected": -0.07990838587284088, "sft_loss": 0.7888859510421753, "step": 1915 }, { "epoch": 1.536, "grad_norm": 10.142634875571975, "learning_rate": 2.8306409756428067e-06, "logits/chosen": -0.8129776120185852, "logits/rejected": -1.2146799564361572, "logps/chosen": -0.631020188331604, "logps/rejected": -1.7902923822402954, "loss": 0.7791, "odds_ratio_loss": 0.4324275851249695, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03155101090669632, "rewards/margins": 0.05796360969543457, "rewards/rejected": -0.08951462805271149, "sft_loss": 0.631020188331604, "step": 1920 }, { "epoch": 1.54, "grad_norm": 6.955192952345903, "learning_rate": 2.8191041196514874e-06, "logits/chosen": -0.9502149820327759, "logits/rejected": -1.0196278095245361, "logps/chosen": -0.8367894291877747, "logps/rejected": -1.5817101001739502, "loss": 0.8259, "odds_ratio_loss": 0.31859907507896423, "rewards/accuracies": 1.0, "rewards/chosen": -0.04183947294950485, "rewards/margins": 0.03724603354930878, "rewards/rejected": -0.07908550649881363, "sft_loss": 0.8367894291877747, "step": 1925 }, { "epoch": 1.544, "grad_norm": 8.19557532330742, "learning_rate": 2.807560351340302e-06, "logits/chosen": -1.2878282070159912, "logits/rejected": -0.9704602956771851, "logps/chosen": -0.6887832880020142, "logps/rejected": -1.2875627279281616, "loss": 0.7609, "odds_ratio_loss": 0.40261825919151306, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.034439168870449066, "rewards/margins": 0.029938969761133194, "rewards/rejected": -0.06437813490629196, "sft_loss": 0.6887832880020142, "step": 1930 }, { "epoch": 1.548, "grad_norm": 11.200304687438543, "learning_rate": 2.7960099207662535e-06, "logits/chosen": -1.1493829488754272, "logits/rejected": -1.0676811933517456, "logps/chosen": -0.7964752316474915, "logps/rejected": -1.2786619663238525, "loss": 0.8397, "odds_ratio_loss": 0.4415219724178314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.039823759347200394, "rewards/margins": 0.024109339341521263, "rewards/rejected": -0.0639331042766571, "sft_loss": 0.7964752316474915, "step": 1935 }, { "epoch": 1.552, "grad_norm": 6.369713349556111, "learning_rate": 2.7844530781306544e-06, "logits/chosen": -0.8758190870285034, "logits/rejected": -0.826004147529602, "logps/chosen": -0.7934783697128296, "logps/rejected": -1.5418189764022827, "loss": 0.7721, "odds_ratio_loss": 0.3249477446079254, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03967391699552536, "rewards/margins": 0.037417031824588776, "rewards/rejected": -0.07709096372127533, "sft_loss": 0.7934783697128296, "step": 1940 }, { "epoch": 1.556, "grad_norm": 7.211892711878723, "learning_rate": 2.77289007377372e-06, "logits/chosen": -0.8543888926506042, "logits/rejected": -0.8026968240737915, "logps/chosen": -0.5843006372451782, "logps/rejected": -1.1217896938323975, "loss": 0.8268, "odds_ratio_loss": 0.388207346200943, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02921503409743309, "rewards/margins": 0.02687445841729641, "rewards/rejected": -0.05608949065208435, "sft_loss": 0.5843006372451782, "step": 1945 }, { "epoch": 1.56, "grad_norm": 6.68682091618046, "learning_rate": 2.761321158169134e-06, "logits/chosen": -1.3163114786148071, "logits/rejected": -0.9326937794685364, "logps/chosen": -0.6405404210090637, "logps/rejected": -1.8683983087539673, "loss": 0.8001, "odds_ratio_loss": 0.3494071960449219, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.032027024775743484, "rewards/margins": 0.0613928958773613, "rewards/rejected": -0.09341991692781448, "sft_loss": 0.6405404210090637, "step": 1950 }, { "epoch": 1.564, "grad_norm": 8.853441517495, "learning_rate": 2.749746581918629e-06, "logits/chosen": -0.7279099225997925, "logits/rejected": -0.7794903516769409, "logps/chosen": -0.6024842262268066, "logps/rejected": -1.2369228601455688, "loss": 0.8124, "odds_ratio_loss": 0.40761059522628784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03012421354651451, "rewards/margins": 0.031721919775009155, "rewards/rejected": -0.061846137046813965, "sft_loss": 0.6024842262268066, "step": 1955 }, { "epoch": 1.568, "grad_norm": 4.771111743627134, "learning_rate": 2.738166595746554e-06, "logits/chosen": -0.6174139976501465, "logits/rejected": -0.9298956990242004, "logps/chosen": -0.8409628868103027, "logps/rejected": -1.2658686637878418, "loss": 0.8408, "odds_ratio_loss": 0.46160784363746643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0420481413602829, "rewards/margins": 0.021245291456580162, "rewards/rejected": -0.0632934421300888, "sft_loss": 0.8409628868103027, "step": 1960 }, { "epoch": 1.572, "grad_norm": 7.568042308319601, "learning_rate": 2.726581450494451e-06, "logits/chosen": -0.5073860883712769, "logits/rejected": -0.8094170689582825, "logps/chosen": -0.6778584718704224, "logps/rejected": -1.0111925601959229, "loss": 0.7613, "odds_ratio_loss": 0.44990649819374084, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.033892922103405, "rewards/margins": 0.016666702926158905, "rewards/rejected": -0.050559621304273605, "sft_loss": 0.6778584718704224, "step": 1965 }, { "epoch": 1.576, "grad_norm": 9.06697099963702, "learning_rate": 2.7149913971156105e-06, "logits/chosen": -0.8449646234512329, "logits/rejected": -1.0290577411651611, "logps/chosen": -0.7261394262313843, "logps/rejected": -1.0704646110534668, "loss": 0.9347, "odds_ratio_loss": 0.4873872697353363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.036306969821453094, "rewards/margins": 0.017216259613633156, "rewards/rejected": -0.0535232312977314, "sft_loss": 0.7261394262313843, "step": 1970 }, { "epoch": 1.58, "grad_norm": 7.716541168169069, "learning_rate": 2.703396686669646e-06, "logits/chosen": -0.43536773324012756, "logits/rejected": -1.230164647102356, "logps/chosen": -0.9158234596252441, "logps/rejected": -1.4675378799438477, "loss": 0.8256, "odds_ratio_loss": 0.4452730119228363, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.045791175216436386, "rewards/margins": 0.027585718780755997, "rewards/rejected": -0.07337689399719238, "sft_loss": 0.9158234596252441, "step": 1975 }, { "epoch": 1.584, "grad_norm": 11.344687962958782, "learning_rate": 2.6917975703170466e-06, "logits/chosen": -0.7123010158538818, "logits/rejected": -1.2155797481536865, "logps/chosen": -0.6988154649734497, "logps/rejected": -1.039637565612793, "loss": 0.7954, "odds_ratio_loss": 0.4463338255882263, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.034940771758556366, "rewards/margins": 0.017041107639670372, "rewards/rejected": -0.05198187753558159, "sft_loss": 0.6988154649734497, "step": 1980 }, { "epoch": 1.588, "grad_norm": 7.131197917094538, "learning_rate": 2.6801942993137435e-06, "logits/chosen": -0.5410433411598206, "logits/rejected": -1.1760857105255127, "logps/chosen": -0.8232455253601074, "logps/rejected": -1.0818697214126587, "loss": 0.8893, "odds_ratio_loss": 0.5539475679397583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04116227477788925, "rewards/margins": 0.012931210920214653, "rewards/rejected": -0.05409349128603935, "sft_loss": 0.8232455253601074, "step": 1985 }, { "epoch": 1.592, "grad_norm": 5.5964963412533555, "learning_rate": 2.668587125005663e-06, "logits/chosen": -0.8460969924926758, "logits/rejected": -1.182850956916809, "logps/chosen": -0.6967536807060242, "logps/rejected": -1.3319151401519775, "loss": 0.7849, "odds_ratio_loss": 0.41814327239990234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03483767807483673, "rewards/margins": 0.03175807744264603, "rewards/rejected": -0.06659575551748276, "sft_loss": 0.6967536807060242, "step": 1990 }, { "epoch": 1.596, "grad_norm": 5.763026931052832, "learning_rate": 2.6569762988232838e-06, "logits/chosen": -0.8818701505661011, "logits/rejected": -1.0999513864517212, "logps/chosen": -0.7708557844161987, "logps/rejected": -1.50910222530365, "loss": 0.7027, "odds_ratio_loss": 0.3464539647102356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03854278475046158, "rewards/margins": 0.03691232204437256, "rewards/rejected": -0.07545509934425354, "sft_loss": 0.7708557844161987, "step": 1995 }, { "epoch": 1.6, "grad_norm": 4.594815757134377, "learning_rate": 2.6453620722761897e-06, "logits/chosen": -0.7889989018440247, "logits/rejected": -1.0313599109649658, "logps/chosen": -1.0150768756866455, "logps/rejected": -1.3293017148971558, "loss": 0.8486, "odds_ratio_loss": 0.5205736756324768, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.050753843039274216, "rewards/margins": 0.01571125164628029, "rewards/rejected": -0.0664650946855545, "sft_loss": 1.0150768756866455, "step": 2000 }, { "epoch": 1.604, "grad_norm": 5.622181491776131, "learning_rate": 2.6337446969476234e-06, "logits/chosen": -0.6143472790718079, "logits/rejected": -0.8800684213638306, "logps/chosen": -0.70451819896698, "logps/rejected": -1.127333641052246, "loss": 0.7694, "odds_ratio_loss": 0.47300809621810913, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03522591292858124, "rewards/margins": 0.021140772849321365, "rewards/rejected": -0.056366682052612305, "sft_loss": 0.70451819896698, "step": 2005 }, { "epoch": 1.608, "grad_norm": 5.4411456874419635, "learning_rate": 2.6221244244890336e-06, "logits/chosen": -0.8750918507575989, "logits/rejected": -1.0322843790054321, "logps/chosen": -1.0601109266281128, "logps/rejected": -1.424378514289856, "loss": 0.8386, "odds_ratio_loss": 0.5982974767684937, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.05300554633140564, "rewards/margins": 0.018213381990790367, "rewards/rejected": -0.07121893018484116, "sft_loss": 1.0601109266281128, "step": 2010 }, { "epoch": 1.612, "grad_norm": 6.189805248319622, "learning_rate": 2.6105015066146266e-06, "logits/chosen": -1.0919370651245117, "logits/rejected": -0.9859377145767212, "logps/chosen": -0.7465149164199829, "logps/rejected": -1.1737520694732666, "loss": 0.7624, "odds_ratio_loss": 0.41314896941185, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.037325747311115265, "rewards/margins": 0.021361857652664185, "rewards/rejected": -0.05868760868906975, "sft_loss": 0.7465149164199829, "step": 2015 }, { "epoch": 1.616, "grad_norm": 9.164029729819344, "learning_rate": 2.5988761950959133e-06, "logits/chosen": -0.8967428207397461, "logits/rejected": -0.6835768818855286, "logps/chosen": -0.7614465951919556, "logps/rejected": -1.4321725368499756, "loss": 0.8553, "odds_ratio_loss": 0.4023088812828064, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03807232901453972, "rewards/margins": 0.03353629633784294, "rewards/rejected": -0.07160861790180206, "sft_loss": 0.7614465951919556, "step": 2020 }, { "epoch": 1.62, "grad_norm": 8.085049537513227, "learning_rate": 2.587248741756253e-06, "logits/chosen": -0.7910897731781006, "logits/rejected": -1.1293184757232666, "logps/chosen": -0.7620183229446411, "logps/rejected": -1.539175033569336, "loss": 0.7589, "odds_ratio_loss": 0.3549695909023285, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.038100920617580414, "rewards/margins": 0.0388578325510025, "rewards/rejected": -0.07695874571800232, "sft_loss": 0.7620183229446411, "step": 2025 }, { "epoch": 1.624, "grad_norm": 10.097540227811598, "learning_rate": 2.575619398465402e-06, "logits/chosen": -0.8627431988716125, "logits/rejected": -0.9311240315437317, "logps/chosen": -0.877997875213623, "logps/rejected": -1.0316495895385742, "loss": 0.8384, "odds_ratio_loss": 0.5710136890411377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04389990121126175, "rewards/margins": 0.007682584226131439, "rewards/rejected": -0.05158247798681259, "sft_loss": 0.877997875213623, "step": 2030 }, { "epoch": 1.6280000000000001, "grad_norm": 7.090714107623907, "learning_rate": 2.563988417134056e-06, "logits/chosen": -0.9987133145332336, "logits/rejected": -0.8781261444091797, "logps/chosen": -0.5755537748336792, "logps/rejected": -1.2042471170425415, "loss": 0.8191, "odds_ratio_loss": 0.31655603647232056, "rewards/accuracies": 1.0, "rewards/chosen": -0.02877769246697426, "rewards/margins": 0.03143466264009476, "rewards/rejected": -0.06021235138177872, "sft_loss": 0.5755537748336792, "step": 2035 }, { "epoch": 1.6320000000000001, "grad_norm": 7.424256408913974, "learning_rate": 2.5523560497083927e-06, "logits/chosen": -0.9880864024162292, "logits/rejected": -1.3933836221694946, "logps/chosen": -0.6906462907791138, "logps/rejected": -1.200685739517212, "loss": 0.8062, "odds_ratio_loss": 0.36662763357162476, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03453231602907181, "rewards/margins": 0.025501970201730728, "rewards/rejected": -0.060034286230802536, "sft_loss": 0.6906462907791138, "step": 2040 }, { "epoch": 1.6360000000000001, "grad_norm": 5.662873396280538, "learning_rate": 2.5407225481646146e-06, "logits/chosen": -0.638294517993927, "logits/rejected": -0.9844516515731812, "logps/chosen": -0.8093409538269043, "logps/rejected": -1.5629360675811768, "loss": 0.8729, "odds_ratio_loss": 0.4065137505531311, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.040467046201229095, "rewards/margins": 0.037679754197597504, "rewards/rejected": -0.0781468003988266, "sft_loss": 0.8093409538269043, "step": 2045 }, { "epoch": 1.6400000000000001, "grad_norm": 4.8043102192548925, "learning_rate": 2.5290881645034932e-06, "logits/chosen": -0.5350710153579712, "logits/rejected": -1.2690541744232178, "logps/chosen": -0.8825578689575195, "logps/rejected": -1.1409144401550293, "loss": 0.8776, "odds_ratio_loss": 0.7036653757095337, "rewards/accuracies": 0.5, "rewards/chosen": -0.04412789270281792, "rewards/margins": 0.01291782595217228, "rewards/rejected": -0.057045720517635345, "sft_loss": 0.8825578689575195, "step": 2050 }, { "epoch": 1.6440000000000001, "grad_norm": 5.31433548093374, "learning_rate": 2.517453150744904e-06, "logits/chosen": -0.8365262746810913, "logits/rejected": -0.594868540763855, "logps/chosen": -0.7498319745063782, "logps/rejected": -1.5040110349655151, "loss": 0.7332, "odds_ratio_loss": 0.33074039220809937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03749159723520279, "rewards/margins": 0.03770895302295685, "rewards/rejected": -0.07520055025815964, "sft_loss": 0.7498319745063782, "step": 2055 }, { "epoch": 1.6480000000000001, "grad_norm": 6.655827286565878, "learning_rate": 2.5058177589223766e-06, "logits/chosen": -0.8108224868774414, "logits/rejected": -0.8111976385116577, "logps/chosen": -0.8283792734146118, "logps/rejected": -1.2744197845458984, "loss": 0.874, "odds_ratio_loss": 0.4357910752296448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04141896218061447, "rewards/margins": 0.022302016615867615, "rewards/rejected": -0.06372098624706268, "sft_loss": 0.8283792734146118, "step": 2060 }, { "epoch": 1.6520000000000001, "grad_norm": 8.036123405257694, "learning_rate": 2.4941822410776247e-06, "logits/chosen": -1.0825451612472534, "logits/rejected": -1.170818567276001, "logps/chosen": -0.9788244366645813, "logps/rejected": -1.9613186120986938, "loss": 0.7772, "odds_ratio_loss": 0.3012261986732483, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04894121736288071, "rewards/margins": 0.04912471026182175, "rewards/rejected": -0.09806593507528305, "sft_loss": 0.9788244366645813, "step": 2065 }, { "epoch": 1.6560000000000001, "grad_norm": 6.061444627487459, "learning_rate": 2.482546849255096e-06, "logits/chosen": -0.7679504752159119, "logits/rejected": -0.9109829664230347, "logps/chosen": -0.6975770592689514, "logps/rejected": -1.4365979433059692, "loss": 0.8016, "odds_ratio_loss": 0.29892677068710327, "rewards/accuracies": 1.0, "rewards/chosen": -0.03487885370850563, "rewards/margins": 0.03695103898644447, "rewards/rejected": -0.0718298926949501, "sft_loss": 0.6975770592689514, "step": 2070 }, { "epoch": 1.6600000000000001, "grad_norm": 4.509609302945273, "learning_rate": 2.470911835496508e-06, "logits/chosen": -0.9424687623977661, "logits/rejected": -1.1301240921020508, "logps/chosen": -0.6970169544219971, "logps/rejected": -1.6458070278167725, "loss": 0.8016, "odds_ratio_loss": 0.3055366277694702, "rewards/accuracies": 1.0, "rewards/chosen": -0.034850846976041794, "rewards/margins": 0.04743950814008713, "rewards/rejected": -0.08229035139083862, "sft_loss": 0.6970169544219971, "step": 2075 }, { "epoch": 1.6640000000000001, "grad_norm": 7.765550436552941, "learning_rate": 2.4592774518353858e-06, "logits/chosen": -0.8300328254699707, "logits/rejected": -1.3914577960968018, "logps/chosen": -0.7543548345565796, "logps/rejected": -1.325756311416626, "loss": 0.8444, "odds_ratio_loss": 0.37441879510879517, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03771774470806122, "rewards/margins": 0.028570080175995827, "rewards/rejected": -0.0662878230214119, "sft_loss": 0.7543548345565796, "step": 2080 }, { "epoch": 1.6680000000000001, "grad_norm": 5.908452371029711, "learning_rate": 2.447643950291608e-06, "logits/chosen": -0.6916254758834839, "logits/rejected": -1.0722490549087524, "logps/chosen": -0.8863399624824524, "logps/rejected": -1.6229625940322876, "loss": 0.7622, "odds_ratio_loss": 0.4502854347229004, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04431699588894844, "rewards/margins": 0.036831121891736984, "rewards/rejected": -0.08114812523126602, "sft_loss": 0.8863399624824524, "step": 2085 }, { "epoch": 1.6720000000000002, "grad_norm": 12.568326286094635, "learning_rate": 2.436011582865945e-06, "logits/chosen": -0.764611542224884, "logits/rejected": -1.219498872756958, "logps/chosen": -0.8522571325302124, "logps/rejected": -1.5530402660369873, "loss": 0.8979, "odds_ratio_loss": 0.4483674466609955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04261285811662674, "rewards/margins": 0.035039156675338745, "rewards/rejected": -0.07765202224254608, "sft_loss": 0.8522571325302124, "step": 2090 }, { "epoch": 1.6760000000000002, "grad_norm": 5.284478090573637, "learning_rate": 2.4243806015345988e-06, "logits/chosen": -1.1326720714569092, "logits/rejected": -0.9926662445068359, "logps/chosen": -0.636461615562439, "logps/rejected": -1.5491859912872314, "loss": 0.7837, "odds_ratio_loss": 0.3587147295475006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03182308003306389, "rewards/margins": 0.04563622921705246, "rewards/rejected": -0.07745930552482605, "sft_loss": 0.636461615562439, "step": 2095 }, { "epoch": 1.6800000000000002, "grad_norm": 5.748225305959376, "learning_rate": 2.4127512582437486e-06, "logits/chosen": -0.8698973655700684, "logits/rejected": -1.1889925003051758, "logps/chosen": -0.825188934803009, "logps/rejected": -1.2668731212615967, "loss": 0.916, "odds_ratio_loss": 0.5938987731933594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.041259441524744034, "rewards/margins": 0.022084210067987442, "rewards/rejected": -0.06334365904331207, "sft_loss": 0.825188934803009, "step": 2100 }, { "epoch": 1.6840000000000002, "grad_norm": 13.816645361412702, "learning_rate": 2.4011238049040875e-06, "logits/chosen": -0.9653146862983704, "logits/rejected": -1.273425817489624, "logps/chosen": -0.48048973083496094, "logps/rejected": -1.9898446798324585, "loss": 0.7611, "odds_ratio_loss": 0.20829610526561737, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.024024488404393196, "rewards/margins": 0.07546775043010712, "rewards/rejected": -0.09949223697185516, "sft_loss": 0.48048973083496094, "step": 2105 }, { "epoch": 1.688, "grad_norm": 7.128418390551076, "learning_rate": 2.3894984933853734e-06, "logits/chosen": -1.2187275886535645, "logits/rejected": -0.9569811820983887, "logps/chosen": -0.7580611109733582, "logps/rejected": -1.5453588962554932, "loss": 0.7746, "odds_ratio_loss": 0.32491961121559143, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.037903059273958206, "rewards/margins": 0.03936489298939705, "rewards/rejected": -0.07726794481277466, "sft_loss": 0.7580611109733582, "step": 2110 }, { "epoch": 1.692, "grad_norm": 9.905564606255297, "learning_rate": 2.377875575510967e-06, "logits/chosen": -0.6468038558959961, "logits/rejected": -1.1671807765960693, "logps/chosen": -0.8859814405441284, "logps/rejected": -1.915808916091919, "loss": 0.8705, "odds_ratio_loss": 0.3132167458534241, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04429907351732254, "rewards/margins": 0.05149136856198311, "rewards/rejected": -0.09579044580459595, "sft_loss": 0.8859814405441284, "step": 2115 }, { "epoch": 1.696, "grad_norm": 5.125087558911786, "learning_rate": 2.366255303052377e-06, "logits/chosen": -0.7792321443557739, "logits/rejected": -0.8570452928543091, "logps/chosen": -1.0220623016357422, "logps/rejected": -1.2880709171295166, "loss": 0.7697, "odds_ratio_loss": 0.534646213054657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05110311508178711, "rewards/margins": 0.013300428166985512, "rewards/rejected": -0.06440354138612747, "sft_loss": 1.0220623016357422, "step": 2120 }, { "epoch": 1.7, "grad_norm": 7.717361248878697, "learning_rate": 2.3546379277238107e-06, "logits/chosen": -0.8779585957527161, "logits/rejected": -1.2526636123657227, "logps/chosen": -1.0405645370483398, "logps/rejected": -1.4684456586837769, "loss": 0.8437, "odds_ratio_loss": 0.5074051022529602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05202822759747505, "rewards/margins": 0.02139405533671379, "rewards/rejected": -0.07342227548360825, "sft_loss": 1.0405645370483398, "step": 2125 }, { "epoch": 1.704, "grad_norm": 8.595275466816915, "learning_rate": 2.3430237011767166e-06, "logits/chosen": -0.7775979042053223, "logits/rejected": -1.551299810409546, "logps/chosen": -0.7821834683418274, "logps/rejected": -2.03926944732666, "loss": 0.8241, "odds_ratio_loss": 0.37603259086608887, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03910917788743973, "rewards/margins": 0.06285430490970612, "rewards/rejected": -0.10196347534656525, "sft_loss": 0.7821834683418274, "step": 2130 }, { "epoch": 1.708, "grad_norm": 5.594173003610877, "learning_rate": 2.3314128749943376e-06, "logits/chosen": -1.2585442066192627, "logits/rejected": -0.8739719390869141, "logps/chosen": -1.027630090713501, "logps/rejected": -1.4903634786605835, "loss": 0.7552, "odds_ratio_loss": 0.51094651222229, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05138150602579117, "rewards/margins": 0.023136669769883156, "rewards/rejected": -0.07451816648244858, "sft_loss": 1.027630090713501, "step": 2135 }, { "epoch": 1.712, "grad_norm": 6.179490163786233, "learning_rate": 2.319805700686257e-06, "logits/chosen": -0.7619789838790894, "logits/rejected": -1.402626633644104, "logps/chosen": -1.0751920938491821, "logps/rejected": -1.9567829370498657, "loss": 0.8477, "odds_ratio_loss": 0.3091539442539215, "rewards/accuracies": 1.0, "rewards/chosen": -0.053759604692459106, "rewards/margins": 0.04407954216003418, "rewards/rejected": -0.09783915430307388, "sft_loss": 1.0751920938491821, "step": 2140 }, { "epoch": 1.716, "grad_norm": 5.014263826927916, "learning_rate": 2.3082024296829538e-06, "logits/chosen": -1.2588303089141846, "logits/rejected": -0.8437775373458862, "logps/chosen": -0.7372163534164429, "logps/rejected": -1.173533320426941, "loss": 0.7309, "odds_ratio_loss": 0.4651245176792145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.036860816180706024, "rewards/margins": 0.021815849468111992, "rewards/rejected": -0.058676671236753464, "sft_loss": 0.7372163534164429, "step": 2145 }, { "epoch": 1.72, "grad_norm": 12.671731572893654, "learning_rate": 2.296603313330355e-06, "logits/chosen": -0.940277099609375, "logits/rejected": -0.9961813688278198, "logps/chosen": -0.6746289134025574, "logps/rejected": -1.4196538925170898, "loss": 0.79, "odds_ratio_loss": 0.29419276118278503, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.033731453120708466, "rewards/margins": 0.037251245230436325, "rewards/rejected": -0.07098269462585449, "sft_loss": 0.6746289134025574, "step": 2150 }, { "epoch": 1.724, "grad_norm": 8.807564564413038, "learning_rate": 2.2850086028843894e-06, "logits/chosen": -0.5990036129951477, "logits/rejected": -1.1123206615447998, "logps/chosen": -0.7771926522254944, "logps/rejected": -1.3297137022018433, "loss": 0.7942, "odds_ratio_loss": 0.3976249098777771, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03885963559150696, "rewards/margins": 0.027626052498817444, "rewards/rejected": -0.0664856880903244, "sft_loss": 0.7771926522254944, "step": 2155 }, { "epoch": 1.728, "grad_norm": 6.458841654862515, "learning_rate": 2.2734185495055503e-06, "logits/chosen": -1.034919261932373, "logits/rejected": -0.8573856353759766, "logps/chosen": -0.7438480257987976, "logps/rejected": -1.2708542346954346, "loss": 0.6921, "odds_ratio_loss": 0.4702334403991699, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03719240427017212, "rewards/margins": 0.026350298896431923, "rewards/rejected": -0.06354270875453949, "sft_loss": 0.7438480257987976, "step": 2160 }, { "epoch": 1.732, "grad_norm": 10.704475448149418, "learning_rate": 2.2618334042534464e-06, "logits/chosen": -0.9798957705497742, "logits/rejected": -0.9897342920303345, "logps/chosen": -0.6244773864746094, "logps/rejected": -1.1638376712799072, "loss": 0.7625, "odds_ratio_loss": 0.35523417592048645, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03122386895120144, "rewards/margins": 0.026968013495206833, "rewards/rejected": -0.05819188430905342, "sft_loss": 0.6244773864746094, "step": 2165 }, { "epoch": 1.736, "grad_norm": 8.192553193640592, "learning_rate": 2.250253418081373e-06, "logits/chosen": -0.960060715675354, "logits/rejected": -0.8580840229988098, "logps/chosen": -0.7168663740158081, "logps/rejected": -1.152551531791687, "loss": 0.7857, "odds_ratio_loss": 0.495716392993927, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03584332391619682, "rewards/margins": 0.021784260869026184, "rewards/rejected": -0.05762758105993271, "sft_loss": 0.7168663740158081, "step": 2170 }, { "epoch": 1.74, "grad_norm": 8.321261439994862, "learning_rate": 2.238678841830867e-06, "logits/chosen": -0.7686837911605835, "logits/rejected": -1.141378402709961, "logps/chosen": -1.0380868911743164, "logps/rejected": -1.489746332168579, "loss": 0.806, "odds_ratio_loss": 0.4990416467189789, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0519043430685997, "rewards/margins": 0.022582972422242165, "rewards/rejected": -0.07448731362819672, "sft_loss": 1.0380868911743164, "step": 2175 }, { "epoch": 1.744, "grad_norm": 6.016319364654182, "learning_rate": 2.22710992622628e-06, "logits/chosen": -1.1537871360778809, "logits/rejected": -0.9235156774520874, "logps/chosen": -1.0170602798461914, "logps/rejected": -2.0361647605895996, "loss": 0.8776, "odds_ratio_loss": 0.5887846350669861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05085300654172897, "rewards/margins": 0.05095522850751877, "rewards/rejected": -0.10180824995040894, "sft_loss": 1.0170602798461914, "step": 2180 }, { "epoch": 1.748, "grad_norm": 10.90847815557463, "learning_rate": 2.2155469218693464e-06, "logits/chosen": -0.8062864542007446, "logits/rejected": -0.944413959980011, "logps/chosen": -0.8070542216300964, "logps/rejected": -1.5685523748397827, "loss": 0.873, "odds_ratio_loss": 0.5560692548751831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0403527170419693, "rewards/margins": 0.038074906915426254, "rewards/rejected": -0.07842762768268585, "sft_loss": 0.8070542216300964, "step": 2185 }, { "epoch": 1.752, "grad_norm": 12.956912471620969, "learning_rate": 2.2039900792337477e-06, "logits/chosen": -0.9245501756668091, "logits/rejected": -0.7286485433578491, "logps/chosen": -1.1378872394561768, "logps/rejected": -1.3941930532455444, "loss": 0.8626, "odds_ratio_loss": 0.5717015266418457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.056894369423389435, "rewards/margins": 0.012815283611416817, "rewards/rejected": -0.0697096511721611, "sft_loss": 1.1378872394561768, "step": 2190 }, { "epoch": 1.756, "grad_norm": 11.281398520267489, "learning_rate": 2.192439648659699e-06, "logits/chosen": -1.042218804359436, "logits/rejected": -0.6264259219169617, "logps/chosen": -1.035028338432312, "logps/rejected": -1.542610764503479, "loss": 0.8083, "odds_ratio_loss": 0.5132459998130798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05175141617655754, "rewards/margins": 0.025379126891493797, "rewards/rejected": -0.07713054120540619, "sft_loss": 1.035028338432312, "step": 2195 }, { "epoch": 1.76, "grad_norm": 7.2793892053250735, "learning_rate": 2.1808958803485134e-06, "logits/chosen": -1.051295280456543, "logits/rejected": -0.6670552492141724, "logps/chosen": -0.6154602766036987, "logps/rejected": -1.3951618671417236, "loss": 0.8288, "odds_ratio_loss": 0.2627789378166199, "rewards/accuracies": 1.0, "rewards/chosen": -0.030773013830184937, "rewards/margins": 0.038985081017017365, "rewards/rejected": -0.0697580948472023, "sft_loss": 0.6154602766036987, "step": 2200 }, { "epoch": 1.764, "grad_norm": 6.172083283993613, "learning_rate": 2.1693590243571937e-06, "logits/chosen": -0.8260824084281921, "logits/rejected": -1.0545563697814941, "logps/chosen": -0.8104951977729797, "logps/rejected": -1.3007080554962158, "loss": 0.7999, "odds_ratio_loss": 0.3759697377681732, "rewards/accuracies": 1.0, "rewards/chosen": -0.04052475839853287, "rewards/margins": 0.024510642513632774, "rewards/rejected": -0.0650353953242302, "sft_loss": 0.8104951977729797, "step": 2205 }, { "epoch": 1.768, "grad_norm": 5.796258673720241, "learning_rate": 2.157829330593008e-06, "logits/chosen": -0.5958111882209778, "logits/rejected": -1.353058099746704, "logps/chosen": -0.7925723791122437, "logps/rejected": -1.575204610824585, "loss": 0.8336, "odds_ratio_loss": 0.31617340445518494, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03962862119078636, "rewards/margins": 0.03913161903619766, "rewards/rejected": -0.07876023650169373, "sft_loss": 0.7925723791122437, "step": 2210 }, { "epoch": 1.772, "grad_norm": 5.853650964962732, "learning_rate": 2.1463070488080777e-06, "logits/chosen": -0.7320128679275513, "logits/rejected": -1.4626697301864624, "logps/chosen": -0.920146107673645, "logps/rejected": -1.587083101272583, "loss": 0.912, "odds_ratio_loss": 0.36855971813201904, "rewards/accuracies": 1.0, "rewards/chosen": -0.046007297933101654, "rewards/margins": 0.03334684669971466, "rewards/rejected": -0.07935415208339691, "sft_loss": 0.920146107673645, "step": 2215 }, { "epoch": 1.776, "grad_norm": 7.102475221339332, "learning_rate": 2.134792428593971e-06, "logits/chosen": -1.0143307447433472, "logits/rejected": -1.0723531246185303, "logps/chosen": -0.6754562854766846, "logps/rejected": -1.2191624641418457, "loss": 0.8002, "odds_ratio_loss": 0.4698086380958557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03377281501889229, "rewards/margins": 0.027185309678316116, "rewards/rejected": -0.060958124697208405, "sft_loss": 0.6754562854766846, "step": 2220 }, { "epoch": 1.78, "grad_norm": 5.132375215504824, "learning_rate": 2.1232857193762923e-06, "logits/chosen": -0.8438177108764648, "logits/rejected": -0.8187214136123657, "logps/chosen": -0.9790526628494263, "logps/rejected": -1.4144220352172852, "loss": 0.8596, "odds_ratio_loss": 0.5008438229560852, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.048952627927064896, "rewards/margins": 0.021768469363451004, "rewards/rejected": -0.0707211047410965, "sft_loss": 0.9790526628494263, "step": 2225 }, { "epoch": 1.784, "grad_norm": 5.864011412934527, "learning_rate": 2.1117871704092818e-06, "logits/chosen": -0.6806105375289917, "logits/rejected": -1.0006288290023804, "logps/chosen": -0.6304813027381897, "logps/rejected": -2.105440378189087, "loss": 0.8079, "odds_ratio_loss": 0.289185106754303, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.031524065881967545, "rewards/margins": 0.07374795526266098, "rewards/rejected": -0.10527201741933823, "sft_loss": 0.6304813027381897, "step": 2230 }, { "epoch": 1.788, "grad_norm": 5.2792769917747755, "learning_rate": 2.1002970307704134e-06, "logits/chosen": -1.001686930656433, "logits/rejected": -0.7307732105255127, "logps/chosen": -0.4030815660953522, "logps/rejected": -2.1336350440979004, "loss": 0.8382, "odds_ratio_loss": 0.0816396027803421, "rewards/accuracies": 1.0, "rewards/chosen": -0.020154079422354698, "rewards/margins": 0.08652767539024353, "rewards/rejected": -0.10668174922466278, "sft_loss": 0.4030815660953522, "step": 2235 }, { "epoch": 1.792, "grad_norm": 6.219063011453153, "learning_rate": 2.0888155493550027e-06, "logits/chosen": -0.6964353322982788, "logits/rejected": -0.9690049290657043, "logps/chosen": -0.9550386667251587, "logps/rejected": -1.2657750844955444, "loss": 0.9063, "odds_ratio_loss": 0.5276353359222412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.047751929610967636, "rewards/margins": 0.015536829829216003, "rewards/rejected": -0.06328876316547394, "sft_loss": 0.9550386667251587, "step": 2240 }, { "epoch": 1.796, "grad_norm": 6.217554170161715, "learning_rate": 2.0773429748708153e-06, "logits/chosen": -1.1106126308441162, "logits/rejected": -0.8946078419685364, "logps/chosen": -0.7019718289375305, "logps/rejected": -2.1581504344940186, "loss": 0.7725, "odds_ratio_loss": 0.34209513664245605, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.035098589956760406, "rewards/margins": 0.07280893623828888, "rewards/rejected": -0.10790753364562988, "sft_loss": 0.7019718289375305, "step": 2245 }, { "epoch": 1.8, "grad_norm": 6.803417111045092, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -1.081549048423767, "logits/rejected": -0.8075908422470093, "logps/chosen": -1.0295336246490479, "logps/rejected": -1.406858205795288, "loss": 0.8343, "odds_ratio_loss": 0.48274731636047363, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05147667974233627, "rewards/margins": 0.018866227939724922, "rewards/rejected": -0.07034290581941605, "sft_loss": 1.0295336246490479, "step": 2250 }, { "epoch": 1.804, "grad_norm": 4.989031637803273, "learning_rate": 2.0544255405570843e-06, "logits/chosen": -0.9463821649551392, "logits/rejected": -1.047937035560608, "logps/chosen": -0.6406688690185547, "logps/rejected": -1.53047776222229, "loss": 0.8074, "odds_ratio_loss": 0.3679446578025818, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.032033443450927734, "rewards/margins": 0.04449043795466423, "rewards/rejected": -0.07652387768030167, "sft_loss": 0.6406688690185547, "step": 2255 }, { "epoch": 1.808, "grad_norm": 6.658453228798975, "learning_rate": 2.0429811771568468e-06, "logits/chosen": -0.7510989308357239, "logits/rejected": -1.236093282699585, "logps/chosen": -0.8555262684822083, "logps/rejected": -1.224966049194336, "loss": 0.8413, "odds_ratio_loss": 0.4976680874824524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04277631267905235, "rewards/margins": 0.01847199536859989, "rewards/rejected": -0.061248309910297394, "sft_loss": 0.8555262684822083, "step": 2260 }, { "epoch": 1.812, "grad_norm": 4.401496364800286, "learning_rate": 2.031546713535688e-06, "logits/chosen": -0.9858955144882202, "logits/rejected": -0.9150092005729675, "logps/chosen": -0.7890244722366333, "logps/rejected": -1.2879811525344849, "loss": 0.7905, "odds_ratio_loss": 0.3429797291755676, "rewards/accuracies": 1.0, "rewards/chosen": -0.039451222866773605, "rewards/margins": 0.02494782581925392, "rewards/rejected": -0.06439904868602753, "sft_loss": 0.7890244722366333, "step": 2265 }, { "epoch": 1.8159999999999998, "grad_norm": 9.174834943476265, "learning_rate": 2.0201223973828917e-06, "logits/chosen": -0.8151395916938782, "logits/rejected": -1.2962208986282349, "logps/chosen": -0.7152162790298462, "logps/rejected": -1.581583857536316, "loss": 0.7903, "odds_ratio_loss": 0.385868102312088, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03576081246137619, "rewards/margins": 0.04331838712096214, "rewards/rejected": -0.07907919585704803, "sft_loss": 0.7152162790298462, "step": 2270 }, { "epoch": 1.8199999999999998, "grad_norm": 6.3991437024550475, "learning_rate": 2.0087084761679245e-06, "logits/chosen": -0.8196107149124146, "logits/rejected": -1.1247520446777344, "logps/chosen": -0.8312146067619324, "logps/rejected": -1.3900336027145386, "loss": 0.7936, "odds_ratio_loss": 0.44879013299942017, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04156073182821274, "rewards/margins": 0.027940943837165833, "rewards/rejected": -0.06950168311595917, "sft_loss": 0.8312146067619324, "step": 2275 }, { "epoch": 1.8239999999999998, "grad_norm": 7.113083017693733, "learning_rate": 1.997305197135089e-06, "logits/chosen": -0.6806960701942444, "logits/rejected": -0.8038619160652161, "logps/chosen": -0.6782374382019043, "logps/rejected": -1.2599354982376099, "loss": 0.8263, "odds_ratio_loss": 0.379936158657074, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.033911872655153275, "rewards/margins": 0.02908490039408207, "rewards/rejected": -0.0629967749118805, "sft_loss": 0.6782374382019043, "step": 2280 }, { "epoch": 1.8279999999999998, "grad_norm": 7.6235951885998015, "learning_rate": 1.985912807298154e-06, "logits/chosen": -1.0690265893936157, "logits/rejected": -0.9779754877090454, "logps/chosen": -0.7425954937934875, "logps/rejected": -1.426913857460022, "loss": 0.7348, "odds_ratio_loss": 0.357009619474411, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03712977468967438, "rewards/margins": 0.03421591967344284, "rewards/rejected": -0.07134570181369781, "sft_loss": 0.7425954937934875, "step": 2285 }, { "epoch": 1.8319999999999999, "grad_norm": 5.615671437369734, "learning_rate": 1.9745315534350157e-06, "logits/chosen": -0.9129589200019836, "logits/rejected": -0.6167286038398743, "logps/chosen": -1.065497636795044, "logps/rejected": -1.2868871688842773, "loss": 0.8386, "odds_ratio_loss": 0.6246089339256287, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05327488109469414, "rewards/margins": 0.011069480329751968, "rewards/rejected": -0.0643443614244461, "sft_loss": 1.065497636795044, "step": 2290 }, { "epoch": 1.8359999999999999, "grad_norm": 6.084128369072109, "learning_rate": 1.963161682082342e-06, "logits/chosen": -0.9707215428352356, "logits/rejected": -0.7441933751106262, "logps/chosen": -0.9074664115905762, "logps/rejected": -1.3849105834960938, "loss": 0.8658, "odds_ratio_loss": 0.4366206228733063, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04537331685423851, "rewards/margins": 0.023872217163443565, "rewards/rejected": -0.06924553215503693, "sft_loss": 0.9074664115905762, "step": 2295 }, { "epoch": 1.8399999999999999, "grad_norm": 7.352748854504632, "learning_rate": 1.9518034395302413e-06, "logits/chosen": -0.7245827317237854, "logits/rejected": -1.2959226369857788, "logps/chosen": -0.7377294301986694, "logps/rejected": -1.4373664855957031, "loss": 0.9049, "odds_ratio_loss": 0.33754152059555054, "rewards/accuracies": 1.0, "rewards/chosen": -0.03688646852970123, "rewards/margins": 0.034981850534677505, "rewards/rejected": -0.07186831533908844, "sft_loss": 0.7377294301986694, "step": 2300 }, { "epoch": 1.8439999999999999, "grad_norm": 4.828613164349784, "learning_rate": 1.940457071816922e-06, "logits/chosen": -0.7781765460968018, "logits/rejected": -1.140925407409668, "logps/chosen": -0.775480329990387, "logps/rejected": -1.096097707748413, "loss": 0.8761, "odds_ratio_loss": 0.48161450028419495, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03877401351928711, "rewards/margins": 0.016030868515372276, "rewards/rejected": -0.054804880172014236, "sft_loss": 0.775480329990387, "step": 2305 }, { "epoch": 1.8479999999999999, "grad_norm": 6.702848454346183, "learning_rate": 1.9291228247233607e-06, "logits/chosen": -0.786065936088562, "logits/rejected": -1.1700165271759033, "logps/chosen": -0.8791561126708984, "logps/rejected": -1.355006217956543, "loss": 0.7591, "odds_ratio_loss": 0.46694913506507874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04395780712366104, "rewards/margins": 0.023792508989572525, "rewards/rejected": -0.06775031983852386, "sft_loss": 0.8791561126708984, "step": 2310 }, { "epoch": 1.8519999999999999, "grad_norm": 7.711938205346599, "learning_rate": 1.9178009437679855e-06, "logits/chosen": -1.0333006381988525, "logits/rejected": -0.5370423197746277, "logps/chosen": -0.3634825050830841, "logps/rejected": -1.1746513843536377, "loss": 0.7101, "odds_ratio_loss": 0.2676122784614563, "rewards/accuracies": 1.0, "rewards/chosen": -0.018174124881625175, "rewards/margins": 0.04055844619870186, "rewards/rejected": -0.05873257666826248, "sft_loss": 0.3634825050830841, "step": 2315 }, { "epoch": 1.8559999999999999, "grad_norm": 8.761311945720486, "learning_rate": 1.9064916742013515e-06, "logits/chosen": -0.33012256026268005, "logits/rejected": -1.075441598892212, "logps/chosen": -0.9848030209541321, "logps/rejected": -1.3406559228897095, "loss": 0.8125, "odds_ratio_loss": 0.5339032411575317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04924015700817108, "rewards/margins": 0.01779264211654663, "rewards/rejected": -0.06703279912471771, "sft_loss": 0.9848030209541321, "step": 2320 }, { "epoch": 1.8599999999999999, "grad_norm": 8.996579586998216, "learning_rate": 1.895195261000831e-06, "logits/chosen": -1.030765414237976, "logits/rejected": -0.9685811996459961, "logps/chosen": -0.8394074440002441, "logps/rejected": -1.4255101680755615, "loss": 0.7645, "odds_ratio_loss": 0.4348466992378235, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04197037220001221, "rewards/margins": 0.029305141419172287, "rewards/rejected": -0.07127551734447479, "sft_loss": 0.8394074440002441, "step": 2325 }, { "epoch": 1.8639999999999999, "grad_norm": 7.188662135547713, "learning_rate": 1.883911948865306e-06, "logits/chosen": -0.7353323698043823, "logits/rejected": -0.9462113380432129, "logps/chosen": -0.6895793676376343, "logps/rejected": -1.388048529624939, "loss": 0.7353, "odds_ratio_loss": 0.40559130907058716, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03447896987199783, "rewards/margins": 0.034923456609249115, "rewards/rejected": -0.06940243393182755, "sft_loss": 0.6895793676376343, "step": 2330 }, { "epoch": 1.8679999999999999, "grad_norm": 7.322809013987031, "learning_rate": 1.872641982209868e-06, "logits/chosen": -0.9933965802192688, "logits/rejected": -0.6932544112205505, "logps/chosen": -0.6619799137115479, "logps/rejected": -1.2481772899627686, "loss": 0.7704, "odds_ratio_loss": 0.38309139013290405, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.033098991960287094, "rewards/margins": 0.029309872537851334, "rewards/rejected": -0.06240885704755783, "sft_loss": 0.6619799137115479, "step": 2335 }, { "epoch": 1.8719999999999999, "grad_norm": 4.882755527025201, "learning_rate": 1.8613856051605242e-06, "logits/chosen": -0.7146080732345581, "logits/rejected": -1.3212826251983643, "logps/chosen": -0.970722496509552, "logps/rejected": -1.4650259017944336, "loss": 0.7754, "odds_ratio_loss": 0.4080827236175537, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04853612929582596, "rewards/margins": 0.02471516653895378, "rewards/rejected": -0.07325129956007004, "sft_loss": 0.970722496509552, "step": 2340 }, { "epoch": 1.876, "grad_norm": 6.023565399066244, "learning_rate": 1.850143061548907e-06, "logits/chosen": -0.8632639050483704, "logits/rejected": -1.1179392337799072, "logps/chosen": -0.7041983008384705, "logps/rejected": -2.4500975608825684, "loss": 0.815, "odds_ratio_loss": 0.27313196659088135, "rewards/accuracies": 1.0, "rewards/chosen": -0.03520992025732994, "rewards/margins": 0.08729497343301773, "rewards/rejected": -0.12250488996505737, "sft_loss": 0.7041983008384705, "step": 2345 }, { "epoch": 1.88, "grad_norm": 7.758596635756381, "learning_rate": 1.8389145949069953e-06, "logits/chosen": -1.2442716360092163, "logits/rejected": -0.7619195580482483, "logps/chosen": -0.8322132229804993, "logps/rejected": -1.4952902793884277, "loss": 0.9443, "odds_ratio_loss": 0.39093518257141113, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.041610658168792725, "rewards/margins": 0.03315385431051254, "rewards/rejected": -0.07476451992988586, "sft_loss": 0.8322132229804993, "step": 2350 }, { "epoch": 1.884, "grad_norm": 6.603546187128215, "learning_rate": 1.827700448461836e-06, "logits/chosen": -0.8992365598678589, "logits/rejected": -0.41766390204429626, "logps/chosen": -0.7582792043685913, "logps/rejected": -1.665892243385315, "loss": 0.8475, "odds_ratio_loss": 0.422523558139801, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0379139669239521, "rewards/margins": 0.04538065940141678, "rewards/rejected": -0.08329462260007858, "sft_loss": 0.7582792043685913, "step": 2355 }, { "epoch": 1.888, "grad_norm": 4.094000393336399, "learning_rate": 1.816500865130279e-06, "logits/chosen": -1.0214028358459473, "logits/rejected": -0.9226012229919434, "logps/chosen": -0.78780597448349, "logps/rejected": -1.5264439582824707, "loss": 0.7332, "odds_ratio_loss": 0.39362823963165283, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03939030319452286, "rewards/margins": 0.036931902170181274, "rewards/rejected": -0.07632219791412354, "sft_loss": 0.78780597448349, "step": 2360 }, { "epoch": 1.892, "grad_norm": 5.145690592897293, "learning_rate": 1.8053160875137137e-06, "logits/chosen": -0.9314631223678589, "logits/rejected": -0.9437487721443176, "logps/chosen": -1.0313526391983032, "logps/rejected": -1.4607595205307007, "loss": 0.8259, "odds_ratio_loss": 0.5963125228881836, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05156763270497322, "rewards/margins": 0.021470338106155396, "rewards/rejected": -0.07303796708583832, "sft_loss": 1.0313526391983032, "step": 2365 }, { "epoch": 1.896, "grad_norm": 7.417881702928459, "learning_rate": 1.7941463578928088e-06, "logits/chosen": -0.797860324382782, "logits/rejected": -1.1919324398040771, "logps/chosen": -0.879246711730957, "logps/rejected": -1.3754128217697144, "loss": 0.8715, "odds_ratio_loss": 0.4276247024536133, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04396234080195427, "rewards/margins": 0.024808308109641075, "rewards/rejected": -0.0687706395983696, "sft_loss": 0.879246711730957, "step": 2370 }, { "epoch": 1.9, "grad_norm": 7.132781683007218, "learning_rate": 1.7829919182222752e-06, "logits/chosen": -0.972023606300354, "logits/rejected": -1.1046335697174072, "logps/chosen": -0.8717896342277527, "logps/rejected": -1.0692293643951416, "loss": 0.8009, "odds_ratio_loss": 0.679745078086853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.043589480221271515, "rewards/margins": 0.009871991351246834, "rewards/rejected": -0.0534614734351635, "sft_loss": 0.8717896342277527, "step": 2375 }, { "epoch": 1.904, "grad_norm": 4.092969776254415, "learning_rate": 1.7718530101256115e-06, "logits/chosen": -1.1578160524368286, "logits/rejected": -1.0545146465301514, "logps/chosen": -0.47031641006469727, "logps/rejected": -1.8584105968475342, "loss": 0.7973, "odds_ratio_loss": 0.1941300332546234, "rewards/accuracies": 1.0, "rewards/chosen": -0.023515818640589714, "rewards/margins": 0.0694047138094902, "rewards/rejected": -0.09292052686214447, "sft_loss": 0.47031641006469727, "step": 2380 }, { "epoch": 1.908, "grad_norm": 6.116647440253481, "learning_rate": 1.7607298748898844e-06, "logits/chosen": -1.042887568473816, "logits/rejected": -0.9683266878128052, "logps/chosen": -0.7862476706504822, "logps/rejected": -1.1969187259674072, "loss": 0.8179, "odds_ratio_loss": 0.5028802752494812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03931238502264023, "rewards/margins": 0.020533554255962372, "rewards/rejected": -0.0598459430038929, "sft_loss": 0.7862476706504822, "step": 2385 }, { "epoch": 1.912, "grad_norm": 5.228060511447272, "learning_rate": 1.7496227534604859e-06, "logits/chosen": -0.8977655172348022, "logits/rejected": -1.0492327213287354, "logps/chosen": -0.6170272827148438, "logps/rejected": -1.3571946620941162, "loss": 0.7784, "odds_ratio_loss": 0.2826383411884308, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.030851364135742188, "rewards/margins": 0.0370083749294281, "rewards/rejected": -0.06785973161458969, "sft_loss": 0.6170272827148438, "step": 2390 }, { "epoch": 1.916, "grad_norm": 10.090456032598201, "learning_rate": 1.7385318864359304e-06, "logits/chosen": -0.9312974214553833, "logits/rejected": -1.235394835472107, "logps/chosen": -0.9995916485786438, "logps/rejected": -1.703447937965393, "loss": 0.8131, "odds_ratio_loss": 0.43445831537246704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04997957870364189, "rewards/margins": 0.0351928174495697, "rewards/rejected": -0.08517240732908249, "sft_loss": 0.9995916485786438, "step": 2395 }, { "epoch": 1.92, "grad_norm": 5.661494882202969, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -1.0951160192489624, "logits/rejected": -0.7472076416015625, "logps/chosen": -0.819926917552948, "logps/rejected": -1.4353911876678467, "loss": 0.8619, "odds_ratio_loss": 0.42657119035720825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04099633917212486, "rewards/margins": 0.03077322244644165, "rewards/rejected": -0.07176957279443741, "sft_loss": 0.819926917552948, "step": 2400 }, { "epoch": 1.924, "grad_norm": 6.119264571326813, "learning_rate": 1.7163998762297013e-06, "logits/chosen": -1.046919584274292, "logits/rejected": -1.0116114616394043, "logps/chosen": -0.6755749583244324, "logps/rejected": -1.557366967201233, "loss": 0.8711, "odds_ratio_loss": 0.2980736196041107, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03377874568104744, "rewards/margins": 0.04408959671854973, "rewards/rejected": -0.07786835730075836, "sft_loss": 0.6755749583244324, "step": 2405 }, { "epoch": 1.928, "grad_norm": 7.45326513403781, "learning_rate": 1.7053592124637557e-06, "logits/chosen": -0.9731330871582031, "logits/rejected": -1.1434471607208252, "logps/chosen": -0.6788956522941589, "logps/rejected": -1.4451775550842285, "loss": 0.8022, "odds_ratio_loss": 0.31722068786621094, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03394477814435959, "rewards/margins": 0.0383140966296196, "rewards/rejected": -0.07225887477397919, "sft_loss": 0.6788956522941589, "step": 2410 }, { "epoch": 1.932, "grad_norm": 5.554908477058027, "learning_rate": 1.6943357619237227e-06, "logits/chosen": -0.9072321653366089, "logits/rejected": -1.343854546546936, "logps/chosen": -0.7743789553642273, "logps/rejected": -1.7294988632202148, "loss": 0.7934, "odds_ratio_loss": 0.27553829550743103, "rewards/accuracies": 1.0, "rewards/chosen": -0.03871894255280495, "rewards/margins": 0.047755997627973557, "rewards/rejected": -0.0864749401807785, "sft_loss": 0.7743789553642273, "step": 2415 }, { "epoch": 1.936, "grad_norm": 9.389343814394115, "learning_rate": 1.6833297633956647e-06, "logits/chosen": -1.0711301565170288, "logits/rejected": -0.810192883014679, "logps/chosen": -0.763390839099884, "logps/rejected": -1.7045867443084717, "loss": 0.8294, "odds_ratio_loss": 0.2711796462535858, "rewards/accuracies": 1.0, "rewards/chosen": -0.03816954419016838, "rewards/margins": 0.0470597967505455, "rewards/rejected": -0.08522933721542358, "sft_loss": 0.763390839099884, "step": 2420 }, { "epoch": 1.94, "grad_norm": 6.732029543770066, "learning_rate": 1.6723414552876052e-06, "logits/chosen": -0.570145845413208, "logits/rejected": -1.3672704696655273, "logps/chosen": -1.0161564350128174, "logps/rejected": -1.465152621269226, "loss": 0.852, "odds_ratio_loss": 0.5028510689735413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05080782249569893, "rewards/margins": 0.022449806332588196, "rewards/rejected": -0.07325764000415802, "sft_loss": 1.0161564350128174, "step": 2425 }, { "epoch": 1.944, "grad_norm": 6.884753284401447, "learning_rate": 1.661371075624363e-06, "logits/chosen": -0.8588671684265137, "logits/rejected": -1.0025584697723389, "logps/chosen": -0.7952778935432434, "logps/rejected": -1.3232905864715576, "loss": 0.8421, "odds_ratio_loss": 0.44380640983581543, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03976389393210411, "rewards/margins": 0.02640063688158989, "rewards/rejected": -0.066164530813694, "sft_loss": 0.7952778935432434, "step": 2430 }, { "epoch": 1.948, "grad_norm": 10.032821854639238, "learning_rate": 1.6504188620423977e-06, "logits/chosen": -0.8894034624099731, "logits/rejected": -1.096508502960205, "logps/chosen": -0.9423715472221375, "logps/rejected": -1.5046286582946777, "loss": 0.83, "odds_ratio_loss": 0.4647153317928314, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04711857810616493, "rewards/margins": 0.028112854808568954, "rewards/rejected": -0.07523143291473389, "sft_loss": 0.9423715472221375, "step": 2435 }, { "epoch": 1.952, "grad_norm": 8.207321665297997, "learning_rate": 1.6394850517846621e-06, "logits/chosen": -0.9905040860176086, "logits/rejected": -0.9356651306152344, "logps/chosen": -0.8063279986381531, "logps/rejected": -1.4606654644012451, "loss": 0.7861, "odds_ratio_loss": 0.4356115460395813, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.040316395461559296, "rewards/margins": 0.03271687030792236, "rewards/rejected": -0.07303327322006226, "sft_loss": 0.8063279986381531, "step": 2440 }, { "epoch": 1.956, "grad_norm": 6.452799460000441, "learning_rate": 1.6285698816954626e-06, "logits/chosen": -0.7353092432022095, "logits/rejected": -1.1551518440246582, "logps/chosen": -1.0718172788619995, "logps/rejected": -1.5583527088165283, "loss": 0.8719, "odds_ratio_loss": 0.46111541986465454, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.05359087139368057, "rewards/margins": 0.024326767772436142, "rewards/rejected": -0.07791763544082642, "sft_loss": 1.0718172788619995, "step": 2445 }, { "epoch": 1.96, "grad_norm": 5.332170706332277, "learning_rate": 1.6176735882153284e-06, "logits/chosen": -0.9285340309143066, "logits/rejected": -1.087899923324585, "logps/chosen": -0.8756929636001587, "logps/rejected": -1.3418347835540771, "loss": 0.781, "odds_ratio_loss": 0.5048962831497192, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.043784648180007935, "rewards/margins": 0.023307088762521744, "rewards/rejected": -0.06709174066781998, "sft_loss": 0.8756929636001587, "step": 2450 }, { "epoch": 1.964, "grad_norm": 6.2107913026430746, "learning_rate": 1.6067964073758901e-06, "logits/chosen": -0.6262539625167847, "logits/rejected": -0.9137803316116333, "logps/chosen": -0.7144273519515991, "logps/rejected": -1.7826179265975952, "loss": 0.7463, "odds_ratio_loss": 0.21934516727924347, "rewards/accuracies": 1.0, "rewards/chosen": -0.035721369087696075, "rewards/margins": 0.05340953543782234, "rewards/rejected": -0.08913090080022812, "sft_loss": 0.7144273519515991, "step": 2455 }, { "epoch": 1.968, "grad_norm": 6.679465106655453, "learning_rate": 1.5959385747947697e-06, "logits/chosen": -0.6637876629829407, "logits/rejected": -1.1082921028137207, "logps/chosen": -0.7338643074035645, "logps/rejected": -1.9011523723602295, "loss": 0.8968, "odds_ratio_loss": 0.26652342081069946, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03669321909546852, "rewards/margins": 0.05836440995335579, "rewards/rejected": -0.09505762159824371, "sft_loss": 0.7338643074035645, "step": 2460 }, { "epoch": 1.972, "grad_norm": 13.019725966030125, "learning_rate": 1.5851003256704697e-06, "logits/chosen": -0.9775069355964661, "logits/rejected": -1.155165195465088, "logps/chosen": -0.771982729434967, "logps/rejected": -1.2554118633270264, "loss": 0.8792, "odds_ratio_loss": 0.5085214376449585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03859913349151611, "rewards/margins": 0.024171466007828712, "rewards/rejected": -0.06277060508728027, "sft_loss": 0.771982729434967, "step": 2465 }, { "epoch": 1.976, "grad_norm": 6.415175624114794, "learning_rate": 1.5742818947772875e-06, "logits/chosen": -0.5195826888084412, "logits/rejected": -0.9917033910751343, "logps/chosen": -0.6547525525093079, "logps/rejected": -1.6238784790039062, "loss": 0.7833, "odds_ratio_loss": 0.3622708320617676, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03273762762546539, "rewards/margins": 0.04845630005002022, "rewards/rejected": -0.08119393140077591, "sft_loss": 0.6547525525093079, "step": 2470 }, { "epoch": 1.98, "grad_norm": 16.881628339846156, "learning_rate": 1.56348351646022e-06, "logits/chosen": -1.1250643730163574, "logits/rejected": -1.0977294445037842, "logps/chosen": -0.7314361333847046, "logps/rejected": -1.5915559530258179, "loss": 0.8277, "odds_ratio_loss": 0.3246075510978699, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03657180815935135, "rewards/margins": 0.043005991727113724, "rewards/rejected": -0.07957780361175537, "sft_loss": 0.7314361333847046, "step": 2475 }, { "epoch": 1.984, "grad_norm": 5.6365603948727, "learning_rate": 1.552705424629898e-06, "logits/chosen": -0.6629668474197388, "logits/rejected": -1.1308306455612183, "logps/chosen": -0.657507061958313, "logps/rejected": -1.4453434944152832, "loss": 0.8399, "odds_ratio_loss": 0.3202642798423767, "rewards/accuracies": 1.0, "rewards/chosen": -0.03287535160779953, "rewards/margins": 0.03939182311296463, "rewards/rejected": -0.07226717472076416, "sft_loss": 0.657507061958313, "step": 2480 }, { "epoch": 1.988, "grad_norm": 11.934551215608066, "learning_rate": 1.5419478527575068e-06, "logits/chosen": -1.0436184406280518, "logits/rejected": -0.9820472002029419, "logps/chosen": -1.0538418292999268, "logps/rejected": -1.5438861846923828, "loss": 0.763, "odds_ratio_loss": 0.44970980286598206, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.05269209295511246, "rewards/margins": 0.024502214044332504, "rewards/rejected": -0.07719431817531586, "sft_loss": 1.0538418292999268, "step": 2485 }, { "epoch": 1.992, "grad_norm": 4.730272496249411, "learning_rate": 1.5312110338697427e-06, "logits/chosen": -0.7060618996620178, "logits/rejected": -1.1768749952316284, "logps/chosen": -0.6467851400375366, "logps/rejected": -1.8637425899505615, "loss": 0.6756, "odds_ratio_loss": 0.24719473719596863, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03233925253152847, "rewards/margins": 0.06084787845611572, "rewards/rejected": -0.09318713843822479, "sft_loss": 0.6467851400375366, "step": 2490 }, { "epoch": 1.996, "grad_norm": 5.17945710231851, "learning_rate": 1.520495200543754e-06, "logits/chosen": -0.6287657022476196, "logits/rejected": -1.2824218273162842, "logps/chosen": -0.8325425982475281, "logps/rejected": -1.3392772674560547, "loss": 0.7908, "odds_ratio_loss": 0.5226107835769653, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.041627127677202225, "rewards/margins": 0.02533673867583275, "rewards/rejected": -0.06696386635303497, "sft_loss": 0.8325425982475281, "step": 2495 }, { "epoch": 2.0, "grad_norm": 7.185045771765787, "learning_rate": 1.509800584902108e-06, "logits/chosen": -0.7127388119697571, "logits/rejected": -0.9128797650337219, "logps/chosen": -0.9189082980155945, "logps/rejected": -1.3660955429077148, "loss": 0.7301, "odds_ratio_loss": 0.41850653290748596, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.045945413410663605, "rewards/margins": 0.022359367460012436, "rewards/rejected": -0.06830477714538574, "sft_loss": 0.9189082980155945, "step": 2500 }, { "epoch": 2.004, "grad_norm": 5.779855004519885, "learning_rate": 1.4991274186077632e-06, "logits/chosen": -0.6605402827262878, "logits/rejected": -0.8482527732849121, "logps/chosen": -0.4051028788089752, "logps/rejected": -1.4240385293960571, "loss": 0.4714, "odds_ratio_loss": 0.22203238308429718, "rewards/accuracies": 1.0, "rewards/chosen": -0.02025514282286167, "rewards/margins": 0.05094677209854126, "rewards/rejected": -0.07120192050933838, "sft_loss": 0.4051028788089752, "step": 2505 }, { "epoch": 2.008, "grad_norm": 5.911255709626764, "learning_rate": 1.4884759328590476e-06, "logits/chosen": -0.8963392376899719, "logits/rejected": -1.2612377405166626, "logps/chosen": -0.5547269582748413, "logps/rejected": -1.2793185710906982, "loss": 0.5443, "odds_ratio_loss": 0.2325483113527298, "rewards/accuracies": 1.0, "rewards/chosen": -0.027736347168684006, "rewards/margins": 0.03622957691550255, "rewards/rejected": -0.06396592408418655, "sft_loss": 0.5547269582748413, "step": 2510 }, { "epoch": 2.012, "grad_norm": 4.263744768867711, "learning_rate": 1.4778463583846553e-06, "logits/chosen": -0.6352235078811646, "logits/rejected": -1.1501035690307617, "logps/chosen": -0.6144863963127136, "logps/rejected": -1.6748031377792358, "loss": 0.5734, "odds_ratio_loss": 0.24970802664756775, "rewards/accuracies": 1.0, "rewards/chosen": -0.03072432242333889, "rewards/margins": 0.05301583930850029, "rewards/rejected": -0.08374015986919403, "sft_loss": 0.6144863963127136, "step": 2515 }, { "epoch": 2.016, "grad_norm": 4.168544749574423, "learning_rate": 1.467238925438646e-06, "logits/chosen": -0.6912413835525513, "logits/rejected": -0.9819846153259277, "logps/chosen": -0.5293835401535034, "logps/rejected": -1.6525404453277588, "loss": 0.5187, "odds_ratio_loss": 0.2445772886276245, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02646917663514614, "rewards/margins": 0.05615784972906113, "rewards/rejected": -0.08262702077627182, "sft_loss": 0.5293835401535034, "step": 2520 }, { "epoch": 2.02, "grad_norm": 6.164215653537583, "learning_rate": 1.4566538637954556e-06, "logits/chosen": -1.3008034229278564, "logits/rejected": -0.7624127864837646, "logps/chosen": -0.40782418847084045, "logps/rejected": -1.5719823837280273, "loss": 0.5314, "odds_ratio_loss": 0.13072045147418976, "rewards/accuracies": 1.0, "rewards/chosen": -0.020391209051012993, "rewards/margins": 0.058207906782627106, "rewards/rejected": -0.07859911769628525, "sft_loss": 0.40782418847084045, "step": 2525 }, { "epoch": 2.024, "grad_norm": 4.4630247981790525, "learning_rate": 1.446091402744923e-06, "logits/chosen": -0.9515730738639832, "logits/rejected": -1.204268217086792, "logps/chosen": -0.748781144618988, "logps/rejected": -1.6079699993133545, "loss": 0.5591, "odds_ratio_loss": 0.27362948656082153, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03743905574083328, "rewards/margins": 0.04295944422483444, "rewards/rejected": -0.08039849996566772, "sft_loss": 0.748781144618988, "step": 2530 }, { "epoch": 2.028, "grad_norm": 5.834985541054327, "learning_rate": 1.4355517710873184e-06, "logits/chosen": -0.761924147605896, "logits/rejected": -1.0106163024902344, "logps/chosen": -0.6143468618392944, "logps/rejected": -1.4826791286468506, "loss": 0.5242, "odds_ratio_loss": 0.3327800929546356, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03071734681725502, "rewards/margins": 0.043416619300842285, "rewards/rejected": -0.074133962392807, "sft_loss": 0.6143468618392944, "step": 2535 }, { "epoch": 2.032, "grad_norm": 4.93330991376578, "learning_rate": 1.4250351971283937e-06, "logits/chosen": -0.7397106885910034, "logits/rejected": -0.974991500377655, "logps/chosen": -0.4890953600406647, "logps/rejected": -1.5005598068237305, "loss": 0.5051, "odds_ratio_loss": 0.21572282910346985, "rewards/accuracies": 1.0, "rewards/chosen": -0.024454768747091293, "rewards/margins": 0.05057322978973389, "rewards/rejected": -0.07502799481153488, "sft_loss": 0.4890953600406647, "step": 2540 }, { "epoch": 2.036, "grad_norm": 5.652910564521034, "learning_rate": 1.41454190867443e-06, "logits/chosen": -0.6541872024536133, "logits/rejected": -1.2734535932540894, "logps/chosen": -0.5036207437515259, "logps/rejected": -1.882581353187561, "loss": 0.4315, "odds_ratio_loss": 0.25754693150520325, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.025181034579873085, "rewards/margins": 0.06894804537296295, "rewards/rejected": -0.09412907809019089, "sft_loss": 0.5036207437515259, "step": 2545 }, { "epoch": 2.04, "grad_norm": 5.023733060512668, "learning_rate": 1.4040721330273063e-06, "logits/chosen": -1.1533195972442627, "logits/rejected": -0.9162420034408569, "logps/chosen": -0.3805043697357178, "logps/rejected": -1.3916971683502197, "loss": 0.5004, "odds_ratio_loss": 0.23528532683849335, "rewards/accuracies": 1.0, "rewards/chosen": -0.01902521774172783, "rewards/margins": 0.0505596399307251, "rewards/rejected": -0.06958486139774323, "sft_loss": 0.3805043697357178, "step": 2550 }, { "epoch": 2.044, "grad_norm": 3.932026075807386, "learning_rate": 1.3936260969795778e-06, "logits/chosen": -0.8410050272941589, "logits/rejected": -0.9105769991874695, "logps/chosen": -0.6416537165641785, "logps/rejected": -1.6031726598739624, "loss": 0.5072, "odds_ratio_loss": 0.26725801825523376, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.032082684338092804, "rewards/margins": 0.04807594418525696, "rewards/rejected": -0.08015862852334976, "sft_loss": 0.6416537165641785, "step": 2555 }, { "epoch": 2.048, "grad_norm": 6.870678133966881, "learning_rate": 1.3832040268095589e-06, "logits/chosen": -0.6729310750961304, "logits/rejected": -0.8668330907821655, "logps/chosen": -0.40369588136672974, "logps/rejected": -1.3457889556884766, "loss": 0.5096, "odds_ratio_loss": 0.2014884054660797, "rewards/accuracies": 1.0, "rewards/chosen": -0.020184790715575218, "rewards/margins": 0.04710465669631958, "rewards/rejected": -0.06728944927453995, "sft_loss": 0.40369588136672974, "step": 2560 }, { "epoch": 2.052, "grad_norm": 7.587293716398404, "learning_rate": 1.3728061482764238e-06, "logits/chosen": -0.843582808971405, "logits/rejected": -0.9166304469108582, "logps/chosen": -0.604731559753418, "logps/rejected": -1.5095970630645752, "loss": 0.5172, "odds_ratio_loss": 0.2633668780326843, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03023657761514187, "rewards/margins": 0.0452432855963707, "rewards/rejected": -0.07547986507415771, "sft_loss": 0.604731559753418, "step": 2565 }, { "epoch": 2.056, "grad_norm": 4.0093961868186625, "learning_rate": 1.362432686615316e-06, "logits/chosen": -0.9857913851737976, "logits/rejected": -0.7414765357971191, "logps/chosen": -0.46555963158607483, "logps/rejected": -1.3948227167129517, "loss": 0.4983, "odds_ratio_loss": 0.2733033299446106, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.023277979344129562, "rewards/margins": 0.046463146805763245, "rewards/rejected": -0.0697411373257637, "sft_loss": 0.46555963158607483, "step": 2570 }, { "epoch": 2.06, "grad_norm": 6.939466081035446, "learning_rate": 1.3520838665324704e-06, "logits/chosen": -1.1071815490722656, "logits/rejected": -0.817790687084198, "logps/chosen": -0.4723474085330963, "logps/rejected": -1.3803852796554565, "loss": 0.5202, "odds_ratio_loss": 0.256916880607605, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.023617371916770935, "rewards/margins": 0.04540189355611801, "rewards/rejected": -0.06901925802230835, "sft_loss": 0.4723474085330963, "step": 2575 }, { "epoch": 2.064, "grad_norm": 4.140170449016894, "learning_rate": 1.3417599122003464e-06, "logits/chosen": -0.450253427028656, "logits/rejected": -0.9482170343399048, "logps/chosen": -0.6843463778495789, "logps/rejected": -1.6336677074432373, "loss": 0.5651, "odds_ratio_loss": 0.27679651975631714, "rewards/accuracies": 1.0, "rewards/chosen": -0.03421732038259506, "rewards/margins": 0.047466062009334564, "rewards/rejected": -0.08168338239192963, "sft_loss": 0.6843463778495789, "step": 2580 }, { "epoch": 2.068, "grad_norm": 5.1100760056211545, "learning_rate": 1.3314610472527645e-06, "logits/chosen": -0.8113574981689453, "logits/rejected": -1.0102758407592773, "logps/chosen": -0.43412670493125916, "logps/rejected": -1.9312794208526611, "loss": 0.4464, "odds_ratio_loss": 0.12802623212337494, "rewards/accuracies": 1.0, "rewards/chosen": -0.02170633338391781, "rewards/margins": 0.07485763728618622, "rewards/rejected": -0.09656397253274918, "sft_loss": 0.43412670493125916, "step": 2585 }, { "epoch": 2.072, "grad_norm": 7.56195378391692, "learning_rate": 1.3211874947800747e-06, "logits/chosen": -0.7834498286247253, "logits/rejected": -0.8870722055435181, "logps/chosen": -0.2993749678134918, "logps/rejected": -1.3909119367599487, "loss": 0.4961, "odds_ratio_loss": 0.1264984905719757, "rewards/accuracies": 1.0, "rewards/chosen": -0.014968748204410076, "rewards/margins": 0.054576851427555084, "rewards/rejected": -0.06954559683799744, "sft_loss": 0.2993749678134918, "step": 2590 }, { "epoch": 2.076, "grad_norm": 5.796090053121948, "learning_rate": 1.3109394773243117e-06, "logits/chosen": -0.9734644889831543, "logits/rejected": -1.1090242862701416, "logps/chosen": -0.729141891002655, "logps/rejected": -1.5011317729949951, "loss": 0.5283, "odds_ratio_loss": 0.26416879892349243, "rewards/accuracies": 1.0, "rewards/chosen": -0.03645709902048111, "rewards/margins": 0.038599494844675064, "rewards/rejected": -0.07505659013986588, "sft_loss": 0.729141891002655, "step": 2595 }, { "epoch": 2.08, "grad_norm": 4.905492874489777, "learning_rate": 1.3007172168743854e-06, "logits/chosen": -0.6443847417831421, "logits/rejected": -0.6509965658187866, "logps/chosen": -0.4077952802181244, "logps/rejected": -1.5444676876068115, "loss": 0.4459, "odds_ratio_loss": 0.1849530041217804, "rewards/accuracies": 1.0, "rewards/chosen": -0.02038976177573204, "rewards/margins": 0.056833624839782715, "rewards/rejected": -0.07722338289022446, "sft_loss": 0.4077952802181244, "step": 2600 }, { "epoch": 2.084, "grad_norm": 8.38385935812006, "learning_rate": 1.2905209348612596e-06, "logits/chosen": -1.0582338571548462, "logits/rejected": -1.0338222980499268, "logps/chosen": -0.3466675281524658, "logps/rejected": -1.3382198810577393, "loss": 0.4997, "odds_ratio_loss": 0.17245514690876007, "rewards/accuracies": 1.0, "rewards/chosen": -0.01733337715268135, "rewards/margins": 0.049577612429857254, "rewards/rejected": -0.0669109970331192, "sft_loss": 0.3466675281524658, "step": 2605 }, { "epoch": 2.088, "grad_norm": 4.39436373774201, "learning_rate": 1.280350852153168e-06, "logits/chosen": -0.9197877049446106, "logits/rejected": -1.4192711114883423, "logps/chosen": -0.5219482183456421, "logps/rejected": -2.023850679397583, "loss": 0.4535, "odds_ratio_loss": 0.1568504273891449, "rewards/accuracies": 1.0, "rewards/chosen": -0.026097416877746582, "rewards/margins": 0.07509511709213257, "rewards/rejected": -0.10119253396987915, "sft_loss": 0.5219482183456421, "step": 2610 }, { "epoch": 2.092, "grad_norm": 31.805321381556926, "learning_rate": 1.2702071890508235e-06, "logits/chosen": -0.8824436068534851, "logits/rejected": -1.0908191204071045, "logps/chosen": -0.49740344285964966, "logps/rejected": -1.838228464126587, "loss": 0.5843, "odds_ratio_loss": 0.20825867354869843, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.024870174005627632, "rewards/margins": 0.06704124808311462, "rewards/rejected": -0.0919114202260971, "sft_loss": 0.49740344285964966, "step": 2615 }, { "epoch": 2.096, "grad_norm": 5.572501524698811, "learning_rate": 1.260090165282645e-06, "logits/chosen": -1.0808252096176147, "logits/rejected": -0.8268556594848633, "logps/chosen": -0.5673039555549622, "logps/rejected": -1.536179780960083, "loss": 0.4523, "odds_ratio_loss": 0.24021944403648376, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.028365198522806168, "rewards/margins": 0.048443786799907684, "rewards/rejected": -0.07680898904800415, "sft_loss": 0.5673039555549622, "step": 2620 }, { "epoch": 2.1, "grad_norm": 5.981975287930625, "learning_rate": 1.2500000000000007e-06, "logits/chosen": -1.054109811782837, "logits/rejected": -1.0586509704589844, "logps/chosen": -0.7665703296661377, "logps/rejected": -1.0670462846755981, "loss": 0.5098, "odds_ratio_loss": 0.5087045431137085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.038328517228364944, "rewards/margins": 0.015023794956505299, "rewards/rejected": -0.05335230752825737, "sft_loss": 0.7665703296661377, "step": 2625 }, { "epoch": 2.104, "grad_norm": 7.544463465155528, "learning_rate": 1.2399369117724582e-06, "logits/chosen": -1.0681202411651611, "logits/rejected": -1.0832048654556274, "logps/chosen": -0.5446698069572449, "logps/rejected": -1.5458736419677734, "loss": 0.4264, "odds_ratio_loss": 0.2728939950466156, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.027233490720391273, "rewards/margins": 0.05006019398570061, "rewards/rejected": -0.07729368656873703, "sft_loss": 0.5446698069572449, "step": 2630 }, { "epoch": 2.108, "grad_norm": 4.9815831235811325, "learning_rate": 1.2299011185830557e-06, "logits/chosen": -1.1409013271331787, "logits/rejected": -0.8681640625, "logps/chosen": -0.3343183994293213, "logps/rejected": -1.3352359533309937, "loss": 0.5583, "odds_ratio_loss": 0.17289471626281738, "rewards/accuracies": 1.0, "rewards/chosen": -0.016715919598937035, "rewards/margins": 0.05004587024450302, "rewards/rejected": -0.0667618066072464, "sft_loss": 0.3343183994293213, "step": 2635 }, { "epoch": 2.112, "grad_norm": 6.73634607533909, "learning_rate": 1.2198928378235717e-06, "logits/chosen": -1.1178267002105713, "logits/rejected": -0.7387592196464539, "logps/chosen": -0.44447723031044006, "logps/rejected": -1.3598803281784058, "loss": 0.5268, "odds_ratio_loss": 0.21014420688152313, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.022223861888051033, "rewards/margins": 0.04577016085386276, "rewards/rejected": -0.06799402087926865, "sft_loss": 0.44447723031044006, "step": 2640 }, { "epoch": 2.116, "grad_norm": 4.379209925386878, "learning_rate": 1.2099122862898214e-06, "logits/chosen": -0.9532085657119751, "logits/rejected": -1.0121229887008667, "logps/chosen": -0.4848550856113434, "logps/rejected": -1.2674915790557861, "loss": 0.4112, "odds_ratio_loss": 0.23585955798625946, "rewards/accuracies": 1.0, "rewards/chosen": -0.02424275316298008, "rewards/margins": 0.039131827652454376, "rewards/rejected": -0.0633745864033699, "sft_loss": 0.4848550856113434, "step": 2645 }, { "epoch": 2.12, "grad_norm": 5.5875611806210825, "learning_rate": 1.1999596801769617e-06, "logits/chosen": -0.9600692987442017, "logits/rejected": -0.9517749547958374, "logps/chosen": -0.24040523171424866, "logps/rejected": -1.3553459644317627, "loss": 0.5047, "odds_ratio_loss": 0.10732688009738922, "rewards/accuracies": 1.0, "rewards/chosen": -0.012020261958241463, "rewards/margins": 0.05574704334139824, "rewards/rejected": -0.06776730716228485, "sft_loss": 0.24040523171424866, "step": 2650 }, { "epoch": 2.124, "grad_norm": 5.665887523645101, "learning_rate": 1.1900352350748026e-06, "logits/chosen": -1.3023062944412231, "logits/rejected": -0.6389235258102417, "logps/chosen": -0.3508966863155365, "logps/rejected": -1.516701102256775, "loss": 0.4761, "odds_ratio_loss": 0.20118489861488342, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.017544833943247795, "rewards/margins": 0.05829022452235222, "rewards/rejected": -0.07583504915237427, "sft_loss": 0.3508966863155365, "step": 2655 }, { "epoch": 2.128, "grad_norm": 4.849761975833439, "learning_rate": 1.1801391659631423e-06, "logits/chosen": -0.7300786375999451, "logits/rejected": -1.2495789527893066, "logps/chosen": -0.7808480262756348, "logps/rejected": -1.481135606765747, "loss": 0.5719, "odds_ratio_loss": 0.3138294816017151, "rewards/accuracies": 1.0, "rewards/chosen": -0.0390424020588398, "rewards/margins": 0.035014379769563675, "rewards/rejected": -0.07405678182840347, "sft_loss": 0.7808480262756348, "step": 2660 }, { "epoch": 2.132, "grad_norm": 7.470137375834024, "learning_rate": 1.170271687207106e-06, "logits/chosen": -0.8559409379959106, "logits/rejected": -0.9146011471748352, "logps/chosen": -0.42539578676223755, "logps/rejected": -1.569939374923706, "loss": 0.505, "odds_ratio_loss": 0.17492997646331787, "rewards/accuracies": 1.0, "rewards/chosen": -0.0212697871029377, "rewards/margins": 0.05722718685865402, "rewards/rejected": -0.07849697768688202, "sft_loss": 0.42539578676223755, "step": 2665 }, { "epoch": 2.136, "grad_norm": 4.739681467147201, "learning_rate": 1.160433012552508e-06, "logits/chosen": -0.571463942527771, "logits/rejected": -1.121757984161377, "logps/chosen": -0.5185319185256958, "logps/rejected": -1.5073848962783813, "loss": 0.5349, "odds_ratio_loss": 0.20745344460010529, "rewards/accuracies": 1.0, "rewards/chosen": -0.025926601141691208, "rewards/margins": 0.04944263771176338, "rewards/rejected": -0.07536924630403519, "sft_loss": 0.5185319185256958, "step": 2670 }, { "epoch": 2.14, "grad_norm": 4.496694368173494, "learning_rate": 1.1506233551212186e-06, "logits/chosen": -0.7737411856651306, "logits/rejected": -1.0046793222427368, "logps/chosen": -0.5965171456336975, "logps/rejected": -1.5169528722763062, "loss": 0.5752, "odds_ratio_loss": 0.23371455073356628, "rewards/accuracies": 1.0, "rewards/chosen": -0.029825860634446144, "rewards/margins": 0.04602178931236267, "rewards/rejected": -0.07584764063358307, "sft_loss": 0.5965171456336975, "step": 2675 }, { "epoch": 2.144, "grad_norm": 5.818039980551584, "learning_rate": 1.1408429274065418e-06, "logits/chosen": -1.1345255374908447, "logits/rejected": -1.221318244934082, "logps/chosen": -0.3457716703414917, "logps/rejected": -1.645448088645935, "loss": 0.3843, "odds_ratio_loss": 0.146096870303154, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.017288584262132645, "rewards/margins": 0.06498382240533829, "rewards/rejected": -0.08227241039276123, "sft_loss": 0.3457716703414917, "step": 2680 }, { "epoch": 2.148, "grad_norm": 7.617489619432182, "learning_rate": 1.1310919412686248e-06, "logits/chosen": -0.6363608837127686, "logits/rejected": -0.9788458943367004, "logps/chosen": -0.31420546770095825, "logps/rejected": -2.0080409049987793, "loss": 0.4396, "odds_ratio_loss": 0.08285339921712875, "rewards/accuracies": 1.0, "rewards/chosen": -0.015710271894931793, "rewards/margins": 0.08469177782535553, "rewards/rejected": -0.10040205717086792, "sft_loss": 0.31420546770095825, "step": 2685 }, { "epoch": 2.152, "grad_norm": 5.306338004940538, "learning_rate": 1.1213706079298566e-06, "logits/chosen": -0.6837140917778015, "logits/rejected": -1.3568503856658936, "logps/chosen": -0.4734787046909332, "logps/rejected": -1.4761061668395996, "loss": 0.4861, "odds_ratio_loss": 0.20139172673225403, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02367393672466278, "rewards/margins": 0.050131380558013916, "rewards/rejected": -0.0738053172826767, "sft_loss": 0.4734787046909332, "step": 2690 }, { "epoch": 2.156, "grad_norm": 5.238611883596988, "learning_rate": 1.1116791379703032e-06, "logits/chosen": -0.6437733769416809, "logits/rejected": -0.8825413584709167, "logps/chosen": -0.3527492880821228, "logps/rejected": -2.517010450363159, "loss": 0.4288, "odds_ratio_loss": 0.10402262210845947, "rewards/accuracies": 1.0, "rewards/chosen": -0.0176374651491642, "rewards/margins": 0.10821305215358734, "rewards/rejected": -0.12585052847862244, "sft_loss": 0.3527492880821228, "step": 2695 }, { "epoch": 2.16, "grad_norm": 5.037423172133621, "learning_rate": 1.1020177413231334e-06, "logits/chosen": -0.7074111700057983, "logits/rejected": -1.1180449724197388, "logps/chosen": -0.706173300743103, "logps/rejected": -1.7855567932128906, "loss": 0.5407, "odds_ratio_loss": 0.2965608835220337, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03530866652727127, "rewards/margins": 0.05396916717290878, "rewards/rejected": -0.08927784115076065, "sft_loss": 0.706173300743103, "step": 2700 }, { "epoch": 2.164, "grad_norm": 11.625726843094434, "learning_rate": 1.0923866272700845e-06, "logits/chosen": -0.6679056286811829, "logits/rejected": -0.8790310025215149, "logps/chosen": -0.37520498037338257, "logps/rejected": -1.725489854812622, "loss": 0.4112, "odds_ratio_loss": 0.1370328813791275, "rewards/accuracies": 1.0, "rewards/chosen": -0.018760250881314278, "rewards/margins": 0.06751424074172974, "rewards/rejected": -0.08627448976039886, "sft_loss": 0.37520498037338257, "step": 2705 }, { "epoch": 2.168, "grad_norm": 5.751387431303456, "learning_rate": 1.0827860044369226e-06, "logits/chosen": -1.1543610095977783, "logits/rejected": -1.1389491558074951, "logps/chosen": -0.8079617619514465, "logps/rejected": -1.881744384765625, "loss": 0.653, "odds_ratio_loss": 0.21621274948120117, "rewards/accuracies": 1.0, "rewards/chosen": -0.04039808362722397, "rewards/margins": 0.0536891333758831, "rewards/rejected": -0.09408722817897797, "sft_loss": 0.8079617619514465, "step": 2710 }, { "epoch": 2.172, "grad_norm": 13.56821195989023, "learning_rate": 1.073216080788921e-06, "logits/chosen": -0.7350510358810425, "logits/rejected": -0.9062551259994507, "logps/chosen": -0.41547298431396484, "logps/rejected": -1.4787991046905518, "loss": 0.4839, "odds_ratio_loss": 0.17242933809757233, "rewards/accuracies": 1.0, "rewards/chosen": -0.02077365107834339, "rewards/margins": 0.053166307508945465, "rewards/rejected": -0.07393995672464371, "sft_loss": 0.41547298431396484, "step": 2715 }, { "epoch": 2.176, "grad_norm": 4.723019200073755, "learning_rate": 1.06367706362636e-06, "logits/chosen": -1.2677191495895386, "logits/rejected": -1.3101189136505127, "logps/chosen": -0.5442667603492737, "logps/rejected": -1.3093074560165405, "loss": 0.4889, "odds_ratio_loss": 0.2581162452697754, "rewards/accuracies": 1.0, "rewards/chosen": -0.027213340625166893, "rewards/margins": 0.03825204446911812, "rewards/rejected": -0.06546537578105927, "sft_loss": 0.5442667603492737, "step": 2720 }, { "epoch": 2.18, "grad_norm": 6.320991135595779, "learning_rate": 1.0541691595800338e-06, "logits/chosen": -0.7713707089424133, "logits/rejected": -0.8052916526794434, "logps/chosen": -0.8031466603279114, "logps/rejected": -1.449881911277771, "loss": 0.4879, "odds_ratio_loss": 0.34610167145729065, "rewards/accuracies": 1.0, "rewards/chosen": -0.04015733301639557, "rewards/margins": 0.0323367640376091, "rewards/rejected": -0.07249408960342407, "sft_loss": 0.8031466603279114, "step": 2725 }, { "epoch": 2.184, "grad_norm": 7.667541694530943, "learning_rate": 1.0446925746067768e-06, "logits/chosen": -0.735578179359436, "logits/rejected": -1.290317416191101, "logps/chosen": -0.4031465947628021, "logps/rejected": -2.864250898361206, "loss": 0.4889, "odds_ratio_loss": 0.11299661546945572, "rewards/accuracies": 1.0, "rewards/chosen": -0.020157333463430405, "rewards/margins": 0.12305520474910736, "rewards/rejected": -0.14321252703666687, "sft_loss": 0.4031465947628021, "step": 2730 }, { "epoch": 2.188, "grad_norm": 8.598577539363314, "learning_rate": 1.0352475139849993e-06, "logits/chosen": -1.1117980480194092, "logits/rejected": -1.2077891826629639, "logps/chosen": -0.6618624329566956, "logps/rejected": -1.303166389465332, "loss": 0.5357, "odds_ratio_loss": 0.4117598533630371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03309311717748642, "rewards/margins": 0.032065197825431824, "rewards/rejected": -0.06515831500291824, "sft_loss": 0.6618624329566956, "step": 2735 }, { "epoch": 2.192, "grad_norm": 5.479651626100435, "learning_rate": 1.0258341823102418e-06, "logits/chosen": -0.873387336730957, "logits/rejected": -1.0599385499954224, "logps/chosen": -0.8911870121955872, "logps/rejected": -1.9314569234848022, "loss": 0.5608, "odds_ratio_loss": 0.3484743535518646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04455935209989548, "rewards/margins": 0.052013497799634933, "rewards/rejected": -0.09657285362482071, "sft_loss": 0.8911870121955872, "step": 2740 }, { "epoch": 2.196, "grad_norm": 6.711233602443305, "learning_rate": 1.0164527834907468e-06, "logits/chosen": -1.1073827743530273, "logits/rejected": -0.8712652325630188, "logps/chosen": -0.3501274287700653, "logps/rejected": -1.0953724384307861, "loss": 0.463, "odds_ratio_loss": 0.2384282350540161, "rewards/accuracies": 1.0, "rewards/chosen": -0.017506374046206474, "rewards/margins": 0.03726224601268768, "rewards/rejected": -0.05476861447095871, "sft_loss": 0.3501274287700653, "step": 2745 }, { "epoch": 2.2, "grad_norm": 6.323686554397781, "learning_rate": 1.0071035207430352e-06, "logits/chosen": -0.5682858228683472, "logits/rejected": -1.2733685970306396, "logps/chosen": -0.5779908895492554, "logps/rejected": -1.2694551944732666, "loss": 0.4752, "odds_ratio_loss": 0.37383320927619934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0288995448499918, "rewards/margins": 0.03457321599125862, "rewards/rejected": -0.06347276270389557, "sft_loss": 0.5779908895492554, "step": 2750 }, { "epoch": 2.204, "grad_norm": 5.639287689051844, "learning_rate": 9.977865965875091e-07, "logits/chosen": -0.7087677717208862, "logits/rejected": -0.8449762463569641, "logps/chosen": -0.5380462408065796, "logps/rejected": -1.518744707107544, "loss": 0.508, "odds_ratio_loss": 0.23005056381225586, "rewards/accuracies": 1.0, "rewards/chosen": -0.02690231241285801, "rewards/margins": 0.049034927040338516, "rewards/rejected": -0.07593724131584167, "sft_loss": 0.5380462408065796, "step": 2755 }, { "epoch": 2.208, "grad_norm": 6.305391688848771, "learning_rate": 9.88502212844063e-07, "logits/chosen": -0.7394359111785889, "logits/rejected": -1.2294960021972656, "logps/chosen": -0.6039851307868958, "logps/rejected": -1.388856291770935, "loss": 0.5097, "odds_ratio_loss": 0.2779824733734131, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.030199263244867325, "rewards/margins": 0.039243556559085846, "rewards/rejected": -0.06944280862808228, "sft_loss": 0.6039851307868958, "step": 2760 }, { "epoch": 2.212, "grad_norm": 5.741952540098281, "learning_rate": 9.792505706277136e-07, "logits/chosen": -0.8889573216438293, "logits/rejected": -0.9258907437324524, "logps/chosen": -0.5408663749694824, "logps/rejected": -1.488094449043274, "loss": 0.5131, "odds_ratio_loss": 0.21949462592601776, "rewards/accuracies": 1.0, "rewards/chosen": -0.02704331837594509, "rewards/margins": 0.047361403703689575, "rewards/rejected": -0.07440472394227982, "sft_loss": 0.5408663749694824, "step": 2765 }, { "epoch": 2.216, "grad_norm": 11.523508801713763, "learning_rate": 9.700318703442437e-07, "logits/chosen": -1.087032437324524, "logits/rejected": -0.9947022199630737, "logps/chosen": -0.47177377343177795, "logps/rejected": -1.4707008600234985, "loss": 0.4983, "odds_ratio_loss": 0.2322104275226593, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02358868531882763, "rewards/margins": 0.04994635283946991, "rewards/rejected": -0.07353504002094269, "sft_loss": 0.47177377343177795, "step": 2770 }, { "epoch": 2.22, "grad_norm": 4.462198076439179, "learning_rate": 9.608463116858544e-07, "logits/chosen": -0.736533522605896, "logits/rejected": -0.829948902130127, "logps/chosen": -0.6742941737174988, "logps/rejected": -1.4203133583068848, "loss": 0.5412, "odds_ratio_loss": 0.3509899079799652, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03371470421552658, "rewards/margins": 0.037300966680049896, "rewards/rejected": -0.07101567089557648, "sft_loss": 0.6742941737174988, "step": 2775 }, { "epoch": 2.224, "grad_norm": 4.228429825727303, "learning_rate": 9.516940936268504e-07, "logits/chosen": -1.1075184345245361, "logits/rejected": -1.0531575679779053, "logps/chosen": -0.21282517910003662, "logps/rejected": -1.0983449220657349, "loss": 0.4243, "odds_ratio_loss": 0.24119925498962402, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.010641258209943771, "rewards/margins": 0.04427598416805267, "rewards/rejected": -0.05491724610328674, "sft_loss": 0.21282517910003662, "step": 2780 }, { "epoch": 2.228, "grad_norm": 5.45480141160989, "learning_rate": 9.4257541441932e-07, "logits/chosen": -0.9974383115768433, "logits/rejected": -1.3215069770812988, "logps/chosen": -0.42205625772476196, "logps/rejected": -1.7309017181396484, "loss": 0.3804, "odds_ratio_loss": 0.16683904826641083, "rewards/accuracies": 1.0, "rewards/chosen": -0.02110280841588974, "rewards/margins": 0.0654422789812088, "rewards/rejected": -0.08654508739709854, "sft_loss": 0.42205625772476196, "step": 2785 }, { "epoch": 2.232, "grad_norm": 9.178748455728739, "learning_rate": 9.334904715888496e-07, "logits/chosen": -1.0844497680664062, "logits/rejected": -1.070488691329956, "logps/chosen": -0.32815462350845337, "logps/rejected": -1.7352529764175415, "loss": 0.4216, "odds_ratio_loss": 0.09935733675956726, "rewards/accuracies": 1.0, "rewards/chosen": -0.016407731920480728, "rewards/margins": 0.07035491615533829, "rewards/rejected": -0.08676265180110931, "sft_loss": 0.32815462350845337, "step": 2790 }, { "epoch": 2.2359999999999998, "grad_norm": 4.558311798079199, "learning_rate": 9.244394619302338e-07, "logits/chosen": -0.9466593861579895, "logits/rejected": -0.8899604678153992, "logps/chosen": -0.7910462617874146, "logps/rejected": -1.691467523574829, "loss": 0.5247, "odds_ratio_loss": 0.33903276920318604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03955231234431267, "rewards/margins": 0.04502106085419655, "rewards/rejected": -0.08457337319850922, "sft_loss": 0.7910462617874146, "step": 2795 }, { "epoch": 2.24, "grad_norm": 5.935563711192816, "learning_rate": 9.154225815032242e-07, "logits/chosen": -0.7447048425674438, "logits/rejected": -1.309199333190918, "logps/chosen": -0.4879288673400879, "logps/rejected": -1.4591110944747925, "loss": 0.5058, "odds_ratio_loss": 0.22252912819385529, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.024396440014243126, "rewards/margins": 0.04855911061167717, "rewards/rejected": -0.07295555621385574, "sft_loss": 0.4879288673400879, "step": 2800 }, { "epoch": 2.2439999999999998, "grad_norm": 5.083831381993147, "learning_rate": 9.064400256282757e-07, "logits/chosen": -0.4584059715270996, "logits/rejected": -0.9872250556945801, "logps/chosen": -0.5474129915237427, "logps/rejected": -1.5436441898345947, "loss": 0.4659, "odds_ratio_loss": 0.20902732014656067, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.027370650321245193, "rewards/margins": 0.049811553210020065, "rewards/rejected": -0.07718220353126526, "sft_loss": 0.5474129915237427, "step": 2805 }, { "epoch": 2.248, "grad_norm": 6.37725598665606, "learning_rate": 8.974919888823164e-07, "logits/chosen": -1.1903154850006104, "logits/rejected": -0.9519041180610657, "logps/chosen": -0.407247930765152, "logps/rejected": -1.9125579595565796, "loss": 0.4918, "odds_ratio_loss": 0.17733065783977509, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02036239765584469, "rewards/margins": 0.07526550441980362, "rewards/rejected": -0.09562790393829346, "sft_loss": 0.407247930765152, "step": 2810 }, { "epoch": 2.252, "grad_norm": 23.791980041949664, "learning_rate": 8.885786650945333e-07, "logits/chosen": -0.9534207582473755, "logits/rejected": -0.8568700551986694, "logps/chosen": -0.4399814009666443, "logps/rejected": -1.6407368183135986, "loss": 0.4943, "odds_ratio_loss": 0.2155378758907318, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.021999072283506393, "rewards/margins": 0.0600377693772316, "rewards/rejected": -0.08203683793544769, "sft_loss": 0.4399814009666443, "step": 2815 }, { "epoch": 2.2560000000000002, "grad_norm": 9.509659444717578, "learning_rate": 8.797002473421729e-07, "logits/chosen": -0.47383326292037964, "logits/rejected": -0.8560094833374023, "logps/chosen": -0.532326340675354, "logps/rejected": -1.574268102645874, "loss": 0.4286, "odds_ratio_loss": 0.2277209758758545, "rewards/accuracies": 1.0, "rewards/chosen": -0.02661631628870964, "rewards/margins": 0.05209709331393242, "rewards/rejected": -0.07871340215206146, "sft_loss": 0.532326340675354, "step": 2820 }, { "epoch": 2.26, "grad_norm": 5.1984524732684765, "learning_rate": 8.708569279463622e-07, "logits/chosen": -0.8231816291809082, "logits/rejected": -1.3245340585708618, "logps/chosen": -0.7534009218215942, "logps/rejected": -1.5175855159759521, "loss": 0.4659, "odds_ratio_loss": 0.22750845551490784, "rewards/accuracies": 1.0, "rewards/chosen": -0.03767004236578941, "rewards/margins": 0.03820923715829849, "rewards/rejected": -0.0758792832493782, "sft_loss": 0.7534009218215942, "step": 2825 }, { "epoch": 2.2640000000000002, "grad_norm": 5.302329927455291, "learning_rate": 8.620488984679378e-07, "logits/chosen": -1.174373984336853, "logits/rejected": -0.9231562614440918, "logps/chosen": -0.49920812249183655, "logps/rejected": -1.5488044023513794, "loss": 0.4205, "odds_ratio_loss": 0.1963232159614563, "rewards/accuracies": 1.0, "rewards/chosen": -0.024960406124591827, "rewards/margins": 0.0524798147380352, "rewards/rejected": -0.07744021713733673, "sft_loss": 0.49920812249183655, "step": 2830 }, { "epoch": 2.268, "grad_norm": 8.784170043250743, "learning_rate": 8.532763497032987e-07, "logits/chosen": -0.7181037068367004, "logits/rejected": -1.2467901706695557, "logps/chosen": -0.5864667892456055, "logps/rejected": -1.448925256729126, "loss": 0.5551, "odds_ratio_loss": 0.26310136914253235, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.029323343187570572, "rewards/margins": 0.043122924864292145, "rewards/rejected": -0.07244626432657242, "sft_loss": 0.5864667892456055, "step": 2835 }, { "epoch": 2.2720000000000002, "grad_norm": 7.046247884710958, "learning_rate": 8.445394716802754e-07, "logits/chosen": -0.8855420351028442, "logits/rejected": -1.3163471221923828, "logps/chosen": -0.4142325818538666, "logps/rejected": -1.6013071537017822, "loss": 0.468, "odds_ratio_loss": 0.14958259463310242, "rewards/accuracies": 1.0, "rewards/chosen": -0.020711630582809448, "rewards/margins": 0.05935372784733772, "rewards/rejected": -0.08006535470485687, "sft_loss": 0.4142325818538666, "step": 2840 }, { "epoch": 2.276, "grad_norm": 5.056482274476605, "learning_rate": 8.35838453654009e-07, "logits/chosen": -0.7100471258163452, "logits/rejected": -0.6780123114585876, "logps/chosen": -0.6010003685951233, "logps/rejected": -1.8489612340927124, "loss": 0.4035, "odds_ratio_loss": 0.2834250032901764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.030050018802285194, "rewards/margins": 0.0623980388045311, "rewards/rejected": -0.09244807064533234, "sft_loss": 0.6010003685951233, "step": 2845 }, { "epoch": 2.2800000000000002, "grad_norm": 5.899418760006874, "learning_rate": 8.271734841028553e-07, "logits/chosen": -0.8226686716079712, "logits/rejected": -1.2473328113555908, "logps/chosen": -0.7500003576278687, "logps/rejected": -1.7512264251708984, "loss": 0.4737, "odds_ratio_loss": 0.24083253741264343, "rewards/accuracies": 1.0, "rewards/chosen": -0.03750002384185791, "rewards/margins": 0.05006130784749985, "rewards/rejected": -0.08756133168935776, "sft_loss": 0.7500003576278687, "step": 2850 }, { "epoch": 2.284, "grad_norm": 6.588276147321622, "learning_rate": 8.185447507243e-07, "logits/chosen": -0.8183411359786987, "logits/rejected": -0.9986063838005066, "logps/chosen": -0.5513178110122681, "logps/rejected": -1.8742460012435913, "loss": 0.5713, "odds_ratio_loss": 0.18965277075767517, "rewards/accuracies": 1.0, "rewards/chosen": -0.027565892785787582, "rewards/margins": 0.06614641845226288, "rewards/rejected": -0.09371231496334076, "sft_loss": 0.5513178110122681, "step": 2855 }, { "epoch": 2.288, "grad_norm": 4.435394123744806, "learning_rate": 8.099524404308948e-07, "logits/chosen": -0.8991169929504395, "logits/rejected": -0.829195499420166, "logps/chosen": -0.4634367525577545, "logps/rejected": -1.410571813583374, "loss": 0.4853, "odds_ratio_loss": 0.23965725302696228, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.023171838372945786, "rewards/margins": 0.047356750816106796, "rewards/rejected": -0.07052858918905258, "sft_loss": 0.4634367525577545, "step": 2860 }, { "epoch": 2.292, "grad_norm": 5.612134177673955, "learning_rate": 8.013967393462094e-07, "logits/chosen": -0.6299402713775635, "logits/rejected": -0.7464720010757446, "logps/chosen": -0.24951812624931335, "logps/rejected": -1.7939624786376953, "loss": 0.4291, "odds_ratio_loss": 0.07172398269176483, "rewards/accuracies": 1.0, "rewards/chosen": -0.012475905939936638, "rewards/margins": 0.07722222059965134, "rewards/rejected": -0.08969812840223312, "sft_loss": 0.24951812624931335, "step": 2865 }, { "epoch": 2.296, "grad_norm": 5.034253606387231, "learning_rate": 7.928778328007918e-07, "logits/chosen": -0.8041081428527832, "logits/rejected": -0.8288570642471313, "logps/chosen": -0.4491206109523773, "logps/rejected": -1.529018521308899, "loss": 0.4568, "odds_ratio_loss": 0.1833478957414627, "rewards/accuracies": 1.0, "rewards/chosen": -0.022456031292676926, "rewards/margins": 0.05399489402770996, "rewards/rejected": -0.07645092159509659, "sft_loss": 0.4491206109523773, "step": 2870 }, { "epoch": 2.3, "grad_norm": 4.4545637866951004, "learning_rate": 7.843959053281663e-07, "logits/chosen": -0.6113277673721313, "logits/rejected": -1.2742953300476074, "logps/chosen": -0.3947676420211792, "logps/rejected": -1.7495063543319702, "loss": 0.4967, "odds_ratio_loss": 0.12505455315113068, "rewards/accuracies": 1.0, "rewards/chosen": -0.01973838172852993, "rewards/margins": 0.06773693859577179, "rewards/rejected": -0.08747532218694687, "sft_loss": 0.3947676420211792, "step": 2875 }, { "epoch": 2.304, "grad_norm": 4.327743928984781, "learning_rate": 7.759511406608255e-07, "logits/chosen": -0.8660491108894348, "logits/rejected": -0.9698010683059692, "logps/chosen": -0.4570358395576477, "logps/rejected": -1.4516005516052246, "loss": 0.4305, "odds_ratio_loss": 0.17015303671360016, "rewards/accuracies": 1.0, "rewards/chosen": -0.022851794958114624, "rewards/margins": 0.049728237092494965, "rewards/rejected": -0.07258002460002899, "sft_loss": 0.4570358395576477, "step": 2880 }, { "epoch": 2.308, "grad_norm": 4.831001325484035, "learning_rate": 7.675437217262571e-07, "logits/chosen": -0.9820590019226074, "logits/rejected": -1.3118226528167725, "logps/chosen": -0.6205871105194092, "logps/rejected": -1.3952069282531738, "loss": 0.5001, "odds_ratio_loss": 0.38392138481140137, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0310293547809124, "rewards/margins": 0.038730986416339874, "rewards/rejected": -0.06976033747196198, "sft_loss": 0.6205871105194092, "step": 2885 }, { "epoch": 2.312, "grad_norm": 12.399106444555985, "learning_rate": 7.591738306429769e-07, "logits/chosen": -0.8063400387763977, "logits/rejected": -0.97333824634552, "logps/chosen": -0.38419461250305176, "logps/rejected": -1.590563178062439, "loss": 0.5036, "odds_ratio_loss": 0.1262492537498474, "rewards/accuracies": 1.0, "rewards/chosen": -0.019209731370210648, "rewards/margins": 0.06031842902302742, "rewards/rejected": -0.07952816039323807, "sft_loss": 0.38419461250305176, "step": 2890 }, { "epoch": 2.316, "grad_norm": 5.421465099897958, "learning_rate": 7.508416487165862e-07, "logits/chosen": -0.9876044392585754, "logits/rejected": -0.8423392176628113, "logps/chosen": -0.2360585480928421, "logps/rejected": -1.1289315223693848, "loss": 0.3904, "odds_ratio_loss": 0.1510620415210724, "rewards/accuracies": 1.0, "rewards/chosen": -0.01180292759090662, "rewards/margins": 0.04464365169405937, "rewards/rejected": -0.056446582078933716, "sft_loss": 0.2360585480928421, "step": 2895 }, { "epoch": 2.32, "grad_norm": 3.870106030569181, "learning_rate": 7.425473564358457e-07, "logits/chosen": -0.6248672604560852, "logits/rejected": -1.0867903232574463, "logps/chosen": -0.4465237557888031, "logps/rejected": -1.3280917406082153, "loss": 0.4823, "odds_ratio_loss": 0.23933851718902588, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.022326190024614334, "rewards/margins": 0.04407840222120285, "rewards/rejected": -0.06640458852052689, "sft_loss": 0.4465237557888031, "step": 2900 }, { "epoch": 2.324, "grad_norm": 6.574323626048454, "learning_rate": 7.342911334687619e-07, "logits/chosen": -0.8632572293281555, "logits/rejected": -1.020462989807129, "logps/chosen": -0.429226815700531, "logps/rejected": -1.3181707859039307, "loss": 0.488, "odds_ratio_loss": 0.21809737384319305, "rewards/accuracies": 1.0, "rewards/chosen": -0.02146134153008461, "rewards/margins": 0.044447191059589386, "rewards/rejected": -0.0659085363149643, "sft_loss": 0.429226815700531, "step": 2905 }, { "epoch": 2.328, "grad_norm": 6.6512572207835525, "learning_rate": 7.260731586586983e-07, "logits/chosen": -0.9791855812072754, "logits/rejected": -0.8407464027404785, "logps/chosen": -0.16527701914310455, "logps/rejected": -1.412431001663208, "loss": 0.4797, "odds_ratio_loss": 0.13517996668815613, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.008263850584626198, "rewards/margins": 0.06235770136117935, "rewards/rejected": -0.070621557533741, "sft_loss": 0.16527701914310455, "step": 2910 }, { "epoch": 2.332, "grad_norm": 8.3662208877826, "learning_rate": 7.178936100204994e-07, "logits/chosen": -0.7169278860092163, "logits/rejected": -1.1434614658355713, "logps/chosen": -0.619706392288208, "logps/rejected": -1.5916235446929932, "loss": 0.5525, "odds_ratio_loss": 0.2509163022041321, "rewards/accuracies": 1.0, "rewards/chosen": -0.03098532184958458, "rewards/margins": 0.0485958568751812, "rewards/rejected": -0.07958117872476578, "sft_loss": 0.619706392288208, "step": 2915 }, { "epoch": 2.336, "grad_norm": 21.850965502390842, "learning_rate": 7.097526647366379e-07, "logits/chosen": -0.9213175773620605, "logits/rejected": -0.9148879051208496, "logps/chosen": -0.4506239891052246, "logps/rejected": -1.790827751159668, "loss": 0.5104, "odds_ratio_loss": 0.1461685597896576, "rewards/accuracies": 1.0, "rewards/chosen": -0.02253119833767414, "rewards/margins": 0.06701019406318665, "rewards/rejected": -0.08954139798879623, "sft_loss": 0.4506239891052246, "step": 2920 }, { "epoch": 2.34, "grad_norm": 4.021479030561525, "learning_rate": 7.016504991533727e-07, "logits/chosen": -0.9241275787353516, "logits/rejected": -1.168180227279663, "logps/chosen": -0.34740298986434937, "logps/rejected": -1.5019714832305908, "loss": 0.3721, "odds_ratio_loss": 0.15549840033054352, "rewards/accuracies": 1.0, "rewards/chosen": -0.017370149493217468, "rewards/margins": 0.057728420943021774, "rewards/rejected": -0.07509858161211014, "sft_loss": 0.34740298986434937, "step": 2925 }, { "epoch": 2.344, "grad_norm": 5.884610068850595, "learning_rate": 6.935872887769299e-07, "logits/chosen": -0.7997936010360718, "logits/rejected": -0.9736310839653015, "logps/chosen": -0.5550605058670044, "logps/rejected": -1.6123679876327515, "loss": 0.4905, "odds_ratio_loss": 0.2296057492494583, "rewards/accuracies": 1.0, "rewards/chosen": -0.027753029018640518, "rewards/margins": 0.05286537855863571, "rewards/rejected": -0.08061840385198593, "sft_loss": 0.5550605058670044, "step": 2930 }, { "epoch": 2.348, "grad_norm": 4.469717951127329, "learning_rate": 6.855632082697045e-07, "logits/chosen": -0.8227846026420593, "logits/rejected": -1.05718195438385, "logps/chosen": -0.5576878786087036, "logps/rejected": -1.503408670425415, "loss": 0.5521, "odds_ratio_loss": 0.23267917335033417, "rewards/accuracies": 1.0, "rewards/chosen": -0.02788439765572548, "rewards/margins": 0.04728604108095169, "rewards/rejected": -0.07517042756080627, "sft_loss": 0.5576878786087036, "step": 2935 }, { "epoch": 2.352, "grad_norm": 4.357069152595027, "learning_rate": 6.775784314464717e-07, "logits/chosen": -0.7742995023727417, "logits/rejected": -0.7836570143699646, "logps/chosen": -0.6989033222198486, "logps/rejected": -1.754209280014038, "loss": 0.4823, "odds_ratio_loss": 0.251584529876709, "rewards/accuracies": 1.0, "rewards/chosen": -0.03494516760110855, "rewards/margins": 0.05276529863476753, "rewards/rejected": -0.08771046996116638, "sft_loss": 0.6989033222198486, "step": 2940 }, { "epoch": 2.356, "grad_norm": 5.097226495202278, "learning_rate": 6.696331312706245e-07, "logits/chosen": -0.8427531123161316, "logits/rejected": -0.6410783529281616, "logps/chosen": -0.4761292338371277, "logps/rejected": -1.3721054792404175, "loss": 0.4982, "odds_ratio_loss": 0.25603288412094116, "rewards/accuracies": 1.0, "rewards/chosen": -0.023806463927030563, "rewards/margins": 0.04479881748557091, "rewards/rejected": -0.06860526651144028, "sft_loss": 0.4761292338371277, "step": 2945 }, { "epoch": 2.36, "grad_norm": 9.533319054768027, "learning_rate": 6.617274798504286e-07, "logits/chosen": -1.0047314167022705, "logits/rejected": -1.1931557655334473, "logps/chosen": -0.30853766202926636, "logps/rejected": -1.573937177658081, "loss": 0.4144, "odds_ratio_loss": 0.1045946478843689, "rewards/accuracies": 1.0, "rewards/chosen": -0.015426883473992348, "rewards/margins": 0.0632699728012085, "rewards/rejected": -0.07869686186313629, "sft_loss": 0.30853766202926636, "step": 2950 }, { "epoch": 2.364, "grad_norm": 5.794627406886997, "learning_rate": 6.538616484352902e-07, "logits/chosen": -0.6078363656997681, "logits/rejected": -0.9947171211242676, "logps/chosen": -0.5081497430801392, "logps/rejected": -1.6594918966293335, "loss": 0.4509, "odds_ratio_loss": 0.18001045286655426, "rewards/accuracies": 1.0, "rewards/chosen": -0.025407487526535988, "rewards/margins": 0.057567108422517776, "rewards/rejected": -0.08297459036111832, "sft_loss": 0.5081497430801392, "step": 2955 }, { "epoch": 2.368, "grad_norm": 8.384862451811653, "learning_rate": 6.460358074120518e-07, "logits/chosen": -1.0173108577728271, "logits/rejected": -0.7680644392967224, "logps/chosen": -0.2868219017982483, "logps/rejected": -1.2532503604888916, "loss": 0.4436, "odds_ratio_loss": 0.1625426560640335, "rewards/accuracies": 1.0, "rewards/chosen": -0.014341096393764019, "rewards/margins": 0.04832141846418381, "rewards/rejected": -0.0626625195145607, "sft_loss": 0.2868219017982483, "step": 2960 }, { "epoch": 2.372, "grad_norm": 5.987701383797447, "learning_rate": 6.382501263012936e-07, "logits/chosen": -0.39140084385871887, "logits/rejected": -1.0467764139175415, "logps/chosen": -0.48364967107772827, "logps/rejected": -1.6967146396636963, "loss": 0.4764, "odds_ratio_loss": 0.14926201105117798, "rewards/accuracies": 1.0, "rewards/chosen": -0.024182487279176712, "rewards/margins": 0.06065324693918228, "rewards/rejected": -0.0848357304930687, "sft_loss": 0.48364967107772827, "step": 2965 }, { "epoch": 2.376, "grad_norm": 5.365533722525087, "learning_rate": 6.305047737536707e-07, "logits/chosen": -1.085931658744812, "logits/rejected": -0.8201769590377808, "logps/chosen": -0.31420257687568665, "logps/rejected": -1.8399406671524048, "loss": 0.4935, "odds_ratio_loss": 0.16777533292770386, "rewards/accuracies": 1.0, "rewards/chosen": -0.015710126608610153, "rewards/margins": 0.07628689706325531, "rewards/rejected": -0.09199702739715576, "sft_loss": 0.31420257687568665, "step": 2970 }, { "epoch": 2.38, "grad_norm": 4.187858613296553, "learning_rate": 6.227999175462521e-07, "logits/chosen": -1.0166877508163452, "logits/rejected": -1.3475017547607422, "logps/chosen": -0.45567312836647034, "logps/rejected": -1.3064647912979126, "loss": 0.4065, "odds_ratio_loss": 0.20544198155403137, "rewards/accuracies": 1.0, "rewards/chosen": -0.022783655673265457, "rewards/margins": 0.0425395742058754, "rewards/rejected": -0.06532323360443115, "sft_loss": 0.45567312836647034, "step": 2975 }, { "epoch": 2.384, "grad_norm": 5.441360920553305, "learning_rate": 6.151357245788917e-07, "logits/chosen": -0.4649627208709717, "logits/rejected": -0.7259455323219299, "logps/chosen": -0.32298606634140015, "logps/rejected": -1.7776187658309937, "loss": 0.421, "odds_ratio_loss": 0.07744672149419785, "rewards/accuracies": 1.0, "rewards/chosen": -0.016149302944540977, "rewards/margins": 0.0727316364645958, "rewards/rejected": -0.08888094127178192, "sft_loss": 0.32298606634140015, "step": 2980 }, { "epoch": 2.388, "grad_norm": 6.473770551803507, "learning_rate": 6.075123608706093e-07, "logits/chosen": -0.9025441408157349, "logits/rejected": -0.8097003698348999, "logps/chosen": -0.30892783403396606, "logps/rejected": -1.4229497909545898, "loss": 0.4896, "odds_ratio_loss": 0.18760545551776886, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.015446392819285393, "rewards/margins": 0.05570109561085701, "rewards/rejected": -0.07114748656749725, "sft_loss": 0.30892783403396606, "step": 2985 }, { "epoch": 2.392, "grad_norm": 7.045700421865737, "learning_rate": 5.999299915559956e-07, "logits/chosen": -0.8020213842391968, "logits/rejected": -0.9755544662475586, "logps/chosen": -0.3655335307121277, "logps/rejected": -1.7111154794692993, "loss": 0.4826, "odds_ratio_loss": 0.1401543915271759, "rewards/accuracies": 1.0, "rewards/chosen": -0.018276674672961235, "rewards/margins": 0.06727910041809082, "rewards/rejected": -0.08555576950311661, "sft_loss": 0.3655335307121277, "step": 2990 }, { "epoch": 2.396, "grad_norm": 5.1559133299959345, "learning_rate": 5.923887808816373e-07, "logits/chosen": -0.7541839480400085, "logits/rejected": -0.8241696357727051, "logps/chosen": -0.32986879348754883, "logps/rejected": -1.7209409475326538, "loss": 0.4559, "odds_ratio_loss": 0.11603020131587982, "rewards/accuracies": 1.0, "rewards/chosen": -0.01649343967437744, "rewards/margins": 0.06955362856388092, "rewards/rejected": -0.08604706078767776, "sft_loss": 0.32986879348754883, "step": 2995 }, { "epoch": 2.4, "grad_norm": 5.5494435865172695, "learning_rate": 5.848888922025553e-07, "logits/chosen": -0.604158341884613, "logits/rejected": -1.010777473449707, "logps/chosen": -0.18639525771141052, "logps/rejected": -1.7561891078948975, "loss": 0.4748, "odds_ratio_loss": 0.08248431980609894, "rewards/accuracies": 1.0, "rewards/chosen": -0.009319763630628586, "rewards/margins": 0.07848969846963882, "rewards/rejected": -0.08780945837497711, "sft_loss": 0.18639525771141052, "step": 3000 }, { "epoch": 2.404, "grad_norm": 7.404224976766671, "learning_rate": 5.774304879786688e-07, "logits/chosen": -0.8347930908203125, "logits/rejected": -1.0326844453811646, "logps/chosen": -0.4041759967803955, "logps/rejected": -1.3478691577911377, "loss": 0.4676, "odds_ratio_loss": 0.1940927505493164, "rewards/accuracies": 1.0, "rewards/chosen": -0.020208800211548805, "rewards/margins": 0.04718465730547905, "rewards/rejected": -0.067393459379673, "sft_loss": 0.4041759967803955, "step": 3005 }, { "epoch": 2.408, "grad_norm": 6.3948612577090005, "learning_rate": 5.700137297712749e-07, "logits/chosen": -1.0781855583190918, "logits/rejected": -0.7031400799751282, "logps/chosen": -0.3962685465812683, "logps/rejected": -1.2765676975250244, "loss": 0.5032, "odds_ratio_loss": 0.2406892478466034, "rewards/accuracies": 1.0, "rewards/chosen": -0.019813427701592445, "rewards/margins": 0.04401496425271034, "rewards/rejected": -0.06382839381694794, "sft_loss": 0.3962685465812683, "step": 3010 }, { "epoch": 2.412, "grad_norm": 5.487144565387309, "learning_rate": 5.626387782395512e-07, "logits/chosen": -0.8737370371818542, "logits/rejected": -1.1139321327209473, "logps/chosen": -0.6051273345947266, "logps/rejected": -1.6024173498153687, "loss": 0.4723, "odds_ratio_loss": 0.23752835392951965, "rewards/accuracies": 1.0, "rewards/chosen": -0.0302563663572073, "rewards/margins": 0.0498645082116127, "rewards/rejected": -0.08012087643146515, "sft_loss": 0.6051273345947266, "step": 3015 }, { "epoch": 2.416, "grad_norm": 6.210678255713737, "learning_rate": 5.553057931370729e-07, "logits/chosen": -0.9348379373550415, "logits/rejected": -0.7690949440002441, "logps/chosen": -0.6480275392532349, "logps/rejected": -1.3571795225143433, "loss": 0.5108, "odds_ratio_loss": 0.2786480188369751, "rewards/accuracies": 1.0, "rewards/chosen": -0.032401375472545624, "rewards/margins": 0.03545759990811348, "rewards/rejected": -0.0678589791059494, "sft_loss": 0.6480275392532349, "step": 3020 }, { "epoch": 2.42, "grad_norm": 4.9274946658912615, "learning_rate": 5.48014933308352e-07, "logits/chosen": -0.7441031336784363, "logits/rejected": -1.0764060020446777, "logps/chosen": -0.38686519861221313, "logps/rejected": -1.4046424627304077, "loss": 0.5403, "odds_ratio_loss": 0.14413750171661377, "rewards/accuracies": 1.0, "rewards/chosen": -0.019343260675668716, "rewards/margins": 0.05088886618614197, "rewards/rejected": -0.07023213058710098, "sft_loss": 0.38686519861221313, "step": 3025 }, { "epoch": 2.424, "grad_norm": 4.697225271080028, "learning_rate": 5.407663566854008e-07, "logits/chosen": -0.9233636856079102, "logits/rejected": -0.7391080260276794, "logps/chosen": -0.3096584677696228, "logps/rejected": -1.552175760269165, "loss": 0.5123, "odds_ratio_loss": 0.2237163782119751, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.015482926741242409, "rewards/margins": 0.062125854194164276, "rewards/rejected": -0.07760877907276154, "sft_loss": 0.3096584677696228, "step": 3030 }, { "epoch": 2.428, "grad_norm": 5.079507350370886, "learning_rate": 5.335602202843054e-07, "logits/chosen": -0.7503083944320679, "logits/rejected": -0.8697785139083862, "logps/chosen": -0.5030153393745422, "logps/rejected": -1.383043885231018, "loss": 0.4996, "odds_ratio_loss": 0.20985543727874756, "rewards/accuracies": 1.0, "rewards/chosen": -0.025150766596198082, "rewards/margins": 0.04400142282247543, "rewards/rejected": -0.06915219128131866, "sft_loss": 0.5030153393745422, "step": 3035 }, { "epoch": 2.432, "grad_norm": 5.622928495063832, "learning_rate": 5.263966802018275e-07, "logits/chosen": -0.8630277514457703, "logits/rejected": -0.9782209396362305, "logps/chosen": -0.364883691072464, "logps/rejected": -3.0828795433044434, "loss": 0.3849, "odds_ratio_loss": 0.11438252776861191, "rewards/accuracies": 1.0, "rewards/chosen": -0.01824418641626835, "rewards/margins": 0.13589979708194733, "rewards/rejected": -0.15414398908615112, "sft_loss": 0.364883691072464, "step": 3040 }, { "epoch": 2.436, "grad_norm": 6.518609406323272, "learning_rate": 5.192758916120236e-07, "logits/chosen": -0.720903754234314, "logits/rejected": -0.9428263902664185, "logps/chosen": -0.44252529740333557, "logps/rejected": -1.551532506942749, "loss": 0.5285, "odds_ratio_loss": 0.19013534486293793, "rewards/accuracies": 1.0, "rewards/chosen": -0.022126266732811928, "rewards/margins": 0.05545036867260933, "rewards/rejected": -0.0775766372680664, "sft_loss": 0.44252529740333557, "step": 3045 }, { "epoch": 2.44, "grad_norm": 6.898029710003963, "learning_rate": 5.121980087628802e-07, "logits/chosen": -1.102452039718628, "logits/rejected": -1.1401493549346924, "logps/chosen": -0.6420519351959229, "logps/rejected": -1.6801929473876953, "loss": 0.491, "odds_ratio_loss": 0.22621390223503113, "rewards/accuracies": 1.0, "rewards/chosen": -0.03210259601473808, "rewards/margins": 0.05190705135464668, "rewards/rejected": -0.08400964736938477, "sft_loss": 0.6420519351959229, "step": 3050 }, { "epoch": 2.444, "grad_norm": 7.428665380081318, "learning_rate": 5.051631849729785e-07, "logits/chosen": -0.7611395120620728, "logits/rejected": -0.9640461802482605, "logps/chosen": -0.7123726606369019, "logps/rejected": -1.2486783266067505, "loss": 0.4901, "odds_ratio_loss": 0.40984076261520386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03561863675713539, "rewards/margins": 0.026815274730324745, "rewards/rejected": -0.06243390962481499, "sft_loss": 0.7123726606369019, "step": 3055 }, { "epoch": 2.448, "grad_norm": 4.15877124152584, "learning_rate": 4.981715726281666e-07, "logits/chosen": -0.8188239336013794, "logits/rejected": -1.3134756088256836, "logps/chosen": -0.42243289947509766, "logps/rejected": -1.7504783868789673, "loss": 0.4158, "odds_ratio_loss": 0.14152325689792633, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.021121647208929062, "rewards/margins": 0.06640227138996124, "rewards/rejected": -0.08752389997243881, "sft_loss": 0.42243289947509766, "step": 3060 }, { "epoch": 2.452, "grad_norm": 4.5394562483362435, "learning_rate": 4.912233231782623e-07, "logits/chosen": -0.844011127948761, "logits/rejected": -0.77781081199646, "logps/chosen": -0.5193901658058167, "logps/rejected": -1.5341647863388062, "loss": 0.5074, "odds_ratio_loss": 0.34647947549819946, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02596951089799404, "rewards/margins": 0.050738729536533356, "rewards/rejected": -0.07670824229717255, "sft_loss": 0.5193901658058167, "step": 3065 }, { "epoch": 2.456, "grad_norm": 5.9484966146690885, "learning_rate": 4.843185871337722e-07, "logits/chosen": -1.1236478090286255, "logits/rejected": -1.187819242477417, "logps/chosen": -0.3840157687664032, "logps/rejected": -1.7428499460220337, "loss": 0.5018, "odds_ratio_loss": 0.13463526964187622, "rewards/accuracies": 1.0, "rewards/chosen": -0.01920078694820404, "rewards/margins": 0.06794170290231705, "rewards/rejected": -0.08714248239994049, "sft_loss": 0.3840157687664032, "step": 3070 }, { "epoch": 2.46, "grad_norm": 5.253460866623653, "learning_rate": 4.774575140626317e-07, "logits/chosen": -0.7584226131439209, "logits/rejected": -0.8733755350112915, "logps/chosen": -0.6464089155197144, "logps/rejected": -1.525433897972107, "loss": 0.4441, "odds_ratio_loss": 0.25305792689323425, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03232044726610184, "rewards/margins": 0.04395125061273575, "rewards/rejected": -0.07627169787883759, "sft_loss": 0.6464089155197144, "step": 3075 }, { "epoch": 2.464, "grad_norm": 12.350993791450163, "learning_rate": 4.706402525869633e-07, "logits/chosen": -0.7759664058685303, "logits/rejected": -0.9143487811088562, "logps/chosen": -0.3908182978630066, "logps/rejected": -1.8124420642852783, "loss": 0.4794, "odds_ratio_loss": 0.12326017767190933, "rewards/accuracies": 1.0, "rewards/chosen": -0.01954091526567936, "rewards/margins": 0.07108117640018463, "rewards/rejected": -0.09062208980321884, "sft_loss": 0.3908182978630066, "step": 3080 }, { "epoch": 2.468, "grad_norm": 14.144313900187102, "learning_rate": 4.638669503798579e-07, "logits/chosen": -0.8764970898628235, "logits/rejected": -0.9682470560073853, "logps/chosen": -0.3992885947227478, "logps/rejected": -1.7859230041503906, "loss": 0.5238, "odds_ratio_loss": 0.14603550732135773, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0199644286185503, "rewards/margins": 0.06933172792196274, "rewards/rejected": -0.08929616957902908, "sft_loss": 0.3992885947227478, "step": 3085 }, { "epoch": 2.472, "grad_norm": 4.717707133780473, "learning_rate": 4.5713775416217884e-07, "logits/chosen": -0.7459074854850769, "logits/rejected": -0.9472630620002747, "logps/chosen": -0.3936541676521301, "logps/rejected": -1.3656264543533325, "loss": 0.4566, "odds_ratio_loss": 0.1686370074748993, "rewards/accuracies": 1.0, "rewards/chosen": -0.019682709127664566, "rewards/margins": 0.04859861359000206, "rewards/rejected": -0.06828131526708603, "sft_loss": 0.3936541676521301, "step": 3090 }, { "epoch": 2.476, "grad_norm": 6.069217496310175, "learning_rate": 4.5045280969937847e-07, "logits/chosen": -0.7254356145858765, "logits/rejected": -1.232452154159546, "logps/chosen": -0.5967345833778381, "logps/rejected": -1.7667179107666016, "loss": 0.4779, "odds_ratio_loss": 0.31498411297798157, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.029836729168891907, "rewards/margins": 0.05849916860461235, "rewards/rejected": -0.08833589404821396, "sft_loss": 0.5967345833778381, "step": 3095 }, { "epoch": 2.48, "grad_norm": 6.104980238131941, "learning_rate": 4.438122617983442e-07, "logits/chosen": -0.8972131609916687, "logits/rejected": -0.9765474200248718, "logps/chosen": -0.42376136779785156, "logps/rejected": -1.4339518547058105, "loss": 0.4595, "odds_ratio_loss": 0.17065298557281494, "rewards/accuracies": 1.0, "rewards/chosen": -0.02118806727230549, "rewards/margins": 0.05050952360033989, "rewards/rejected": -0.07169759273529053, "sft_loss": 0.42376136779785156, "step": 3100 }, { "epoch": 2.484, "grad_norm": 6.781146830921427, "learning_rate": 4.372162543042624e-07, "logits/chosen": -0.8027432560920715, "logits/rejected": -0.9535185694694519, "logps/chosen": -0.5751426219940186, "logps/rejected": -1.7298011779785156, "loss": 0.443, "odds_ratio_loss": 0.19305340945720673, "rewards/accuracies": 1.0, "rewards/chosen": -0.02875712886452675, "rewards/margins": 0.05773293972015381, "rewards/rejected": -0.08649007230997086, "sft_loss": 0.5751426219940186, "step": 3105 }, { "epoch": 2.488, "grad_norm": 5.981163589398859, "learning_rate": 4.3066493009749853e-07, "logits/chosen": -0.6171215772628784, "logits/rejected": -0.874431312084198, "logps/chosen": -0.6073184609413147, "logps/rejected": -1.8347069025039673, "loss": 0.3958, "odds_ratio_loss": 0.24824929237365723, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.030365925282239914, "rewards/margins": 0.061369407922029495, "rewards/rejected": -0.09173533320426941, "sft_loss": 0.6073184609413147, "step": 3110 }, { "epoch": 2.492, "grad_norm": 5.642504715902896, "learning_rate": 4.2415843109050667e-07, "logits/chosen": -0.9500174522399902, "logits/rejected": -1.0051153898239136, "logps/chosen": -0.3405509889125824, "logps/rejected": -1.6603267192840576, "loss": 0.4716, "odds_ratio_loss": 0.1337331086397171, "rewards/accuracies": 1.0, "rewards/chosen": -0.01702754944562912, "rewards/margins": 0.06598879396915436, "rewards/rejected": -0.08301634341478348, "sft_loss": 0.3405509889125824, "step": 3115 }, { "epoch": 2.496, "grad_norm": 6.4696426005968775, "learning_rate": 4.1769689822475147e-07, "logits/chosen": -0.57563316822052, "logits/rejected": -0.9418606758117676, "logps/chosen": -0.5054479837417603, "logps/rejected": -1.5817244052886963, "loss": 0.4687, "odds_ratio_loss": 0.21483473479747772, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.025272399187088013, "rewards/margins": 0.05381382629275322, "rewards/rejected": -0.07908622920513153, "sft_loss": 0.5054479837417603, "step": 3120 }, { "epoch": 2.5, "grad_norm": 4.172914702601585, "learning_rate": 4.1128047146765936e-07, "logits/chosen": -0.7602171897888184, "logits/rejected": -1.0409475564956665, "logps/chosen": -0.40496939420700073, "logps/rejected": -1.4085716009140015, "loss": 0.4094, "odds_ratio_loss": 0.21070821583271027, "rewards/accuracies": 1.0, "rewards/chosen": -0.020248468965291977, "rewards/margins": 0.050180114805698395, "rewards/rejected": -0.07042858749628067, "sft_loss": 0.40496939420700073, "step": 3125 }, { "epoch": 2.504, "grad_norm": 5.627553812250054, "learning_rate": 4.049092898095816e-07, "logits/chosen": -0.8009653091430664, "logits/rejected": -1.1446640491485596, "logps/chosen": -0.302462637424469, "logps/rejected": -1.758111596107483, "loss": 0.5068, "odds_ratio_loss": 0.08966059982776642, "rewards/accuracies": 1.0, "rewards/chosen": -0.015123131684958935, "rewards/margins": 0.07278244942426682, "rewards/rejected": -0.08790557831525803, "sft_loss": 0.302462637424469, "step": 3130 }, { "epoch": 2.508, "grad_norm": 5.282423292390493, "learning_rate": 3.9858349126078945e-07, "logits/chosen": -1.0294119119644165, "logits/rejected": -0.672359049320221, "logps/chosen": -0.2604931592941284, "logps/rejected": -1.511343002319336, "loss": 0.4927, "odds_ratio_loss": 0.10487516969442368, "rewards/accuracies": 1.0, "rewards/chosen": -0.013024657964706421, "rewards/margins": 0.06254249066114426, "rewards/rejected": -0.07556714862585068, "sft_loss": 0.2604931592941284, "step": 3135 }, { "epoch": 2.512, "grad_norm": 5.2623944719501585, "learning_rate": 3.9230321284847856e-07, "logits/chosen": -0.9906686544418335, "logits/rejected": -0.7455857992172241, "logps/chosen": -0.599448561668396, "logps/rejected": -1.501896619796753, "loss": 0.4545, "odds_ratio_loss": 0.2790692448616028, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02997243031859398, "rewards/margins": 0.045122403651475906, "rewards/rejected": -0.07509483397006989, "sft_loss": 0.599448561668396, "step": 3140 }, { "epoch": 2.516, "grad_norm": 5.779281261694176, "learning_rate": 3.86068590613804e-07, "logits/chosen": -0.8702551126480103, "logits/rejected": -1.0743459463119507, "logps/chosen": -0.2949695885181427, "logps/rejected": -1.3731797933578491, "loss": 0.4727, "odds_ratio_loss": 0.13363251090049744, "rewards/accuracies": 1.0, "rewards/chosen": -0.01474847923964262, "rewards/margins": 0.0539105124771595, "rewards/rejected": -0.0686589926481247, "sft_loss": 0.2949695885181427, "step": 3145 }, { "epoch": 2.52, "grad_norm": 6.263065115062914, "learning_rate": 3.798797596089351e-07, "logits/chosen": -1.070436716079712, "logits/rejected": -1.0166070461273193, "logps/chosen": -0.2887745499610901, "logps/rejected": -1.5660569667816162, "loss": 0.4361, "odds_ratio_loss": 0.11832698434591293, "rewards/accuracies": 1.0, "rewards/chosen": -0.014438727870583534, "rewards/margins": 0.06386412680149078, "rewards/rejected": -0.07830285280942917, "sft_loss": 0.2887745499610901, "step": 3150 }, { "epoch": 2.524, "grad_norm": 6.821197735395411, "learning_rate": 3.737368538941255e-07, "logits/chosen": -0.9708150029182434, "logits/rejected": -0.8317630887031555, "logps/chosen": -0.41128820180892944, "logps/rejected": -1.2074782848358154, "loss": 0.4127, "odds_ratio_loss": 0.2423263043165207, "rewards/accuracies": 1.0, "rewards/chosen": -0.020564410835504532, "rewards/margins": 0.03980950638651848, "rewards/rejected": -0.06037392094731331, "sft_loss": 0.41128820180892944, "step": 3155 }, { "epoch": 2.528, "grad_norm": 5.542329936785543, "learning_rate": 3.6764000653481263e-07, "logits/chosen": -1.2484712600708008, "logits/rejected": -0.9088461995124817, "logps/chosen": -0.3338097929954529, "logps/rejected": -1.6267837285995483, "loss": 0.4552, "odds_ratio_loss": 0.12026141583919525, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.016690488904714584, "rewards/margins": 0.06464870274066925, "rewards/rejected": -0.08133919537067413, "sft_loss": 0.3338097929954529, "step": 3160 }, { "epoch": 2.532, "grad_norm": 8.97897074618829, "learning_rate": 3.615893495987335e-07, "logits/chosen": -0.5620380640029907, "logits/rejected": -0.8675304651260376, "logps/chosen": -0.5898982882499695, "logps/rejected": -1.3201124668121338, "loss": 0.4837, "odds_ratio_loss": 0.3108993470668793, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.029494917020201683, "rewards/margins": 0.036510709673166275, "rewards/rejected": -0.0660056322813034, "sft_loss": 0.5898982882499695, "step": 3165 }, { "epoch": 2.536, "grad_norm": 4.681514042861988, "learning_rate": 3.555850141530659e-07, "logits/chosen": -1.0148943662643433, "logits/rejected": -0.982884407043457, "logps/chosen": -0.22993119060993195, "logps/rejected": -1.4043452739715576, "loss": 0.5221, "odds_ratio_loss": 0.08220822364091873, "rewards/accuracies": 1.0, "rewards/chosen": -0.011496557854115963, "rewards/margins": 0.058720700442790985, "rewards/rejected": -0.07021726667881012, "sft_loss": 0.22993119060993195, "step": 3170 }, { "epoch": 2.54, "grad_norm": 7.722013429404472, "learning_rate": 3.4962713026158697e-07, "logits/chosen": -0.5106195211410522, "logits/rejected": -1.0413014888763428, "logps/chosen": -0.6414362788200378, "logps/rejected": -1.3033192157745361, "loss": 0.4655, "odds_ratio_loss": 0.3088650107383728, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03207181394100189, "rewards/margins": 0.0330941416323185, "rewards/rejected": -0.06516595929861069, "sft_loss": 0.6414362788200378, "step": 3175 }, { "epoch": 2.544, "grad_norm": 4.914705494262134, "learning_rate": 3.4371582698185636e-07, "logits/chosen": -1.18616783618927, "logits/rejected": -0.963121771812439, "logps/chosen": -0.31630367040634155, "logps/rejected": -1.29836106300354, "loss": 0.5218, "odds_ratio_loss": 0.20544719696044922, "rewards/accuracies": 1.0, "rewards/chosen": -0.015815183520317078, "rewards/margins": 0.049102868884801865, "rewards/rejected": -0.06491805613040924, "sft_loss": 0.31630367040634155, "step": 3180 }, { "epoch": 2.548, "grad_norm": 10.854835141924323, "learning_rate": 3.378512323624228e-07, "logits/chosen": -0.5074478387832642, "logits/rejected": -1.2903783321380615, "logps/chosen": -0.7434813976287842, "logps/rejected": -1.6036231517791748, "loss": 0.5155, "odds_ratio_loss": 0.265799343585968, "rewards/accuracies": 1.0, "rewards/chosen": -0.03717407211661339, "rewards/margins": 0.04300709441304207, "rewards/rejected": -0.08018116652965546, "sft_loss": 0.7434813976287842, "step": 3185 }, { "epoch": 2.552, "grad_norm": 5.764157322717205, "learning_rate": 3.3203347344004737e-07, "logits/chosen": -0.7570070028305054, "logits/rejected": -1.1250250339508057, "logps/chosen": -0.6759325265884399, "logps/rejected": -1.439347267150879, "loss": 0.5504, "odds_ratio_loss": 0.3499813973903656, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.033796630799770355, "rewards/margins": 0.03817072883248329, "rewards/rejected": -0.07196736335754395, "sft_loss": 0.6759325265884399, "step": 3190 }, { "epoch": 2.556, "grad_norm": 4.888945884008844, "learning_rate": 3.262626762369525e-07, "logits/chosen": -1.1923449039459229, "logits/rejected": -0.8670722842216492, "logps/chosen": -0.31347471475601196, "logps/rejected": -1.4067668914794922, "loss": 0.4052, "odds_ratio_loss": 0.12347825616598129, "rewards/accuracies": 1.0, "rewards/chosen": -0.01567373424768448, "rewards/margins": 0.05466460436582565, "rewards/rejected": -0.07033834606409073, "sft_loss": 0.31347471475601196, "step": 3195 }, { "epoch": 2.56, "grad_norm": 4.380022841165363, "learning_rate": 3.2053896575809426e-07, "logits/chosen": -0.5602080821990967, "logits/rejected": -1.0972874164581299, "logps/chosen": -0.7473759651184082, "logps/rejected": -1.6633918285369873, "loss": 0.4408, "odds_ratio_loss": 0.2667500674724579, "rewards/accuracies": 1.0, "rewards/chosen": -0.03736879676580429, "rewards/margins": 0.04580079764127731, "rewards/rejected": -0.0831695944070816, "sft_loss": 0.7473759651184082, "step": 3200 }, { "epoch": 2.564, "grad_norm": 5.674973517980157, "learning_rate": 3.148624659884508e-07, "logits/chosen": -0.7617183327674866, "logits/rejected": -0.972303569316864, "logps/chosen": -0.4322594702243805, "logps/rejected": -1.4709136486053467, "loss": 0.4488, "odds_ratio_loss": 0.186048686504364, "rewards/accuracies": 1.0, "rewards/chosen": -0.021612973883748055, "rewards/margins": 0.05193271115422249, "rewards/rejected": -0.0735456794500351, "sft_loss": 0.4322594702243805, "step": 3205 }, { "epoch": 2.568, "grad_norm": 4.977829670896469, "learning_rate": 3.092332998903416e-07, "logits/chosen": -0.8257712125778198, "logits/rejected": -0.8016840219497681, "logps/chosen": -0.4363314211368561, "logps/rejected": -1.1737916469573975, "loss": 0.4606, "odds_ratio_loss": 0.22321203351020813, "rewards/accuracies": 1.0, "rewards/chosen": -0.021816570311784744, "rewards/margins": 0.03687301650643349, "rewards/rejected": -0.05868958681821823, "sft_loss": 0.4363314211368561, "step": 3210 }, { "epoch": 2.572, "grad_norm": 5.0240652358480755, "learning_rate": 3.0365158940075664e-07, "logits/chosen": -1.0381290912628174, "logits/rejected": -1.1659702062606812, "logps/chosen": -0.5309845209121704, "logps/rejected": -1.4535753726959229, "loss": 0.6035, "odds_ratio_loss": 0.28817451000213623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.02654922381043434, "rewards/margins": 0.04612954333424568, "rewards/rejected": -0.07267877459526062, "sft_loss": 0.5309845209121704, "step": 3215 }, { "epoch": 2.576, "grad_norm": 5.408869231167505, "learning_rate": 2.981174554287239e-07, "logits/chosen": -1.015199899673462, "logits/rejected": -1.3747820854187012, "logps/chosen": -0.4069809317588806, "logps/rejected": -1.5189340114593506, "loss": 0.4696, "odds_ratio_loss": 0.16449251770973206, "rewards/accuracies": 1.0, "rewards/chosen": -0.02034904807806015, "rewards/margins": 0.05559765547513962, "rewards/rejected": -0.07594670355319977, "sft_loss": 0.4069809317588806, "step": 3220 }, { "epoch": 2.58, "grad_norm": 7.760925759203226, "learning_rate": 2.9263101785268253e-07, "logits/chosen": -0.5573514699935913, "logits/rejected": -1.252190113067627, "logps/chosen": -0.41566723585128784, "logps/rejected": -1.4632532596588135, "loss": 0.4529, "odds_ratio_loss": 0.16219457983970642, "rewards/accuracies": 1.0, "rewards/chosen": -0.02078336291015148, "rewards/margins": 0.0523792989552021, "rewards/rejected": -0.07316266000270844, "sft_loss": 0.41566723585128784, "step": 3225 }, { "epoch": 2.584, "grad_norm": 6.9168438538457995, "learning_rate": 2.871923955178918e-07, "logits/chosen": -0.7818273901939392, "logits/rejected": -0.9009189605712891, "logps/chosen": -0.5484607219696045, "logps/rejected": -1.4074246883392334, "loss": 0.4963, "odds_ratio_loss": 0.2880396842956543, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.027423039078712463, "rewards/margins": 0.042948197573423386, "rewards/rejected": -0.07037124037742615, "sft_loss": 0.5484607219696045, "step": 3230 }, { "epoch": 2.588, "grad_norm": 5.722693546238487, "learning_rate": 2.8180170623385213e-07, "logits/chosen": -0.9606497883796692, "logits/rejected": -0.9924991726875305, "logps/chosen": -0.4993307590484619, "logps/rejected": -2.206209659576416, "loss": 0.4349, "odds_ratio_loss": 0.2471529245376587, "rewards/accuracies": 1.0, "rewards/chosen": -0.024966537952423096, "rewards/margins": 0.08534395694732666, "rewards/rejected": -0.11031049489974976, "sft_loss": 0.4993307590484619, "step": 3235 }, { "epoch": 2.592, "grad_norm": 5.892839113416792, "learning_rate": 2.764590667717562e-07, "logits/chosen": -0.47418349981307983, "logits/rejected": -1.210965871810913, "logps/chosen": -0.5353994965553284, "logps/rejected": -1.8738822937011719, "loss": 0.4341, "odds_ratio_loss": 0.2881673276424408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.02676997520029545, "rewards/margins": 0.06692413985729218, "rewards/rejected": -0.09369411319494247, "sft_loss": 0.5353994965553284, "step": 3240 }, { "epoch": 2.596, "grad_norm": 4.975415723996069, "learning_rate": 2.7116459286195887e-07, "logits/chosen": -1.0550581216812134, "logits/rejected": -1.0994212627410889, "logps/chosen": -0.29386892914772034, "logps/rejected": -1.3806931972503662, "loss": 0.4509, "odds_ratio_loss": 0.1265828013420105, "rewards/accuracies": 1.0, "rewards/chosen": -0.014693446457386017, "rewards/margins": 0.054341208189725876, "rewards/rejected": -0.0690346509218216, "sft_loss": 0.29386892914772034, "step": 3245 }, { "epoch": 2.6, "grad_norm": 4.814970729318306, "learning_rate": 2.6591839919146963e-07, "logits/chosen": -0.7558291554450989, "logits/rejected": -0.8831847906112671, "logps/chosen": -0.4392101764678955, "logps/rejected": -1.304246187210083, "loss": 0.494, "odds_ratio_loss": 0.24885766208171844, "rewards/accuracies": 1.0, "rewards/chosen": -0.021960508078336716, "rewards/margins": 0.04325180500745773, "rewards/rejected": -0.06521230936050415, "sft_loss": 0.4392101764678955, "step": 3250 }, { "epoch": 2.604, "grad_norm": 8.307336904912425, "learning_rate": 2.6072059940146775e-07, "logits/chosen": -1.1570188999176025, "logits/rejected": -0.8211970329284668, "logps/chosen": -0.4389687180519104, "logps/rejected": -1.5756551027297974, "loss": 0.4346, "odds_ratio_loss": 0.1567719727754593, "rewards/accuracies": 1.0, "rewards/chosen": -0.0219484381377697, "rewards/margins": 0.05683432146906853, "rewards/rejected": -0.07878275215625763, "sft_loss": 0.4389687180519104, "step": 3255 }, { "epoch": 2.608, "grad_norm": 5.842083846009801, "learning_rate": 2.555713060848433e-07, "logits/chosen": -0.9055282473564148, "logits/rejected": -1.0206440687179565, "logps/chosen": -0.6034643650054932, "logps/rejected": -1.3517249822616577, "loss": 0.4756, "odds_ratio_loss": 0.37321245670318604, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.030173221603035927, "rewards/margins": 0.037413034588098526, "rewards/rejected": -0.067586250603199, "sft_loss": 0.6034643650054932, "step": 3260 }, { "epoch": 2.612, "grad_norm": 26.57275268189309, "learning_rate": 2.504706307837551e-07, "logits/chosen": -0.9575685262680054, "logits/rejected": -1.1905924081802368, "logps/chosen": -0.43124690651893616, "logps/rejected": -1.5143071413040161, "loss": 0.4799, "odds_ratio_loss": 0.24718472361564636, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.021562347188591957, "rewards/margins": 0.054153017699718475, "rewards/rejected": -0.07571535557508469, "sft_loss": 0.43124690651893616, "step": 3265 }, { "epoch": 2.616, "grad_norm": 12.548046244131415, "learning_rate": 2.454186839872158e-07, "logits/chosen": -0.8983935117721558, "logits/rejected": -0.9400730133056641, "logps/chosen": -0.5530645251274109, "logps/rejected": -1.652138113975525, "loss": 0.4669, "odds_ratio_loss": 0.21761877834796906, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.027653228491544724, "rewards/margins": 0.054953683167696, "rewards/rejected": -0.08260690420866013, "sft_loss": 0.5530645251274109, "step": 3270 }, { "epoch": 2.62, "grad_norm": 5.389650137843442, "learning_rate": 2.404155751286988e-07, "logits/chosen": -1.0610014200210571, "logits/rejected": -1.133551836013794, "logps/chosen": -0.39243531227111816, "logps/rejected": -1.79599130153656, "loss": 0.4603, "odds_ratio_loss": 0.2275979220867157, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.01962176337838173, "rewards/margins": 0.07017780840396881, "rewards/rejected": -0.08979956060647964, "sft_loss": 0.39243531227111816, "step": 3275 }, { "epoch": 2.624, "grad_norm": 5.664088641927079, "learning_rate": 2.3546141258376786e-07, "logits/chosen": -0.7254642248153687, "logits/rejected": -0.6703656911849976, "logps/chosen": -0.3163744807243347, "logps/rejected": -1.2279107570648193, "loss": 0.5009, "odds_ratio_loss": 0.15358732640743256, "rewards/accuracies": 1.0, "rewards/chosen": -0.015818724408745766, "rewards/margins": 0.04557682201266289, "rewards/rejected": -0.0613955482840538, "sft_loss": 0.3163744807243347, "step": 3280 }, { "epoch": 2.628, "grad_norm": 4.6344681245327495, "learning_rate": 2.3055630366772857e-07, "logits/chosen": -0.7210429906845093, "logits/rejected": -0.7038585543632507, "logps/chosen": -0.45490559935569763, "logps/rejected": -1.3154146671295166, "loss": 0.5243, "odds_ratio_loss": 0.2467753142118454, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.022745277732610703, "rewards/margins": 0.04302545264363289, "rewards/rejected": -0.06577073037624359, "sft_loss": 0.45490559935569763, "step": 3285 }, { "epoch": 2.632, "grad_norm": 4.8837015178641705, "learning_rate": 2.257003546333042e-07, "logits/chosen": -0.9644176363945007, "logits/rejected": -1.0574061870574951, "logps/chosen": -0.4861271381378174, "logps/rejected": -1.230362892150879, "loss": 0.506, "odds_ratio_loss": 0.223617285490036, "rewards/accuracies": 1.0, "rewards/chosen": -0.02430635876953602, "rewards/margins": 0.03721178323030472, "rewards/rejected": -0.061518143862485886, "sft_loss": 0.4861271381378174, "step": 3290 }, { "epoch": 2.636, "grad_norm": 4.840338076295951, "learning_rate": 2.208936706683351e-07, "logits/chosen": -0.747097909450531, "logits/rejected": -0.7051985859870911, "logps/chosen": -0.5597037076950073, "logps/rejected": -1.4610236883163452, "loss": 0.4651, "odds_ratio_loss": 0.3216997981071472, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.027985185384750366, "rewards/margins": 0.045065999031066895, "rewards/rejected": -0.07305117696523666, "sft_loss": 0.5597037076950073, "step": 3295 }, { "epoch": 2.64, "grad_norm": 7.554990598127512, "learning_rate": 2.1613635589349756e-07, "logits/chosen": -0.8587006330490112, "logits/rejected": -0.9386578798294067, "logps/chosen": -0.4001200199127197, "logps/rejected": -1.5990378856658936, "loss": 0.4853, "odds_ratio_loss": 0.2407374083995819, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.020006002858281136, "rewards/margins": 0.05994589254260063, "rewards/rejected": -0.07995189726352692, "sft_loss": 0.4001200199127197, "step": 3300 }, { "epoch": 2.644, "grad_norm": 9.49274069461471, "learning_rate": 2.1142851336005244e-07, "logits/chosen": -1.1812444925308228, "logits/rejected": -1.0167138576507568, "logps/chosen": -0.39482712745666504, "logps/rejected": -1.4087440967559814, "loss": 0.4741, "odds_ratio_loss": 0.14712780714035034, "rewards/accuracies": 1.0, "rewards/chosen": -0.019741356372833252, "rewards/margins": 0.05069585517048836, "rewards/rejected": -0.07043720781803131, "sft_loss": 0.39482712745666504, "step": 3305 }, { "epoch": 2.648, "grad_norm": 6.995561448070307, "learning_rate": 2.0677024504760752e-07, "logits/chosen": -1.0624709129333496, "logits/rejected": -1.4392796754837036, "logps/chosen": -0.5113095641136169, "logps/rejected": -1.4682908058166504, "loss": 0.4876, "odds_ratio_loss": 0.37565088272094727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.025565480813384056, "rewards/margins": 0.04784905165433884, "rewards/rejected": -0.07341454178094864, "sft_loss": 0.5113095641136169, "step": 3310 }, { "epoch": 2.652, "grad_norm": 7.478983570395769, "learning_rate": 2.0216165186191406e-07, "logits/chosen": -0.7986949682235718, "logits/rejected": -1.3443512916564941, "logps/chosen": -0.6050113439559937, "logps/rejected": -1.6553246974945068, "loss": 0.4864, "odds_ratio_loss": 0.1889444887638092, "rewards/accuracies": 1.0, "rewards/chosen": -0.030250567942857742, "rewards/margins": 0.052515674382448196, "rewards/rejected": -0.08276623487472534, "sft_loss": 0.6050113439559937, "step": 3315 }, { "epoch": 2.656, "grad_norm": 5.628793749435426, "learning_rate": 1.9760283363267684e-07, "logits/chosen": -0.96771240234375, "logits/rejected": -0.6631850004196167, "logps/chosen": -0.5663673877716064, "logps/rejected": -1.225140929222107, "loss": 0.5049, "odds_ratio_loss": 0.3630017340183258, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.028318369761109352, "rewards/margins": 0.03293868154287338, "rewards/rejected": -0.06125704571604729, "sft_loss": 0.5663673877716064, "step": 3320 }, { "epoch": 2.66, "grad_norm": 4.202178749975983, "learning_rate": 1.9309388911139427e-07, "logits/chosen": -0.8587250709533691, "logits/rejected": -1.0437920093536377, "logps/chosen": -0.5410333871841431, "logps/rejected": -1.3033370971679688, "loss": 0.4574, "odds_ratio_loss": 0.25340771675109863, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.027051672339439392, "rewards/margins": 0.038115184754133224, "rewards/rejected": -0.06516685336828232, "sft_loss": 0.5410333871841431, "step": 3325 }, { "epoch": 2.664, "grad_norm": 4.996160979584086, "learning_rate": 1.8863491596921745e-07, "logits/chosen": -0.9663417935371399, "logits/rejected": -1.0380603075027466, "logps/chosen": -0.42533501982688904, "logps/rejected": -1.5897793769836426, "loss": 0.4536, "odds_ratio_loss": 0.1942438781261444, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0212667528539896, "rewards/margins": 0.058222223073244095, "rewards/rejected": -0.07948897033929825, "sft_loss": 0.42533501982688904, "step": 3330 }, { "epoch": 2.668, "grad_norm": 5.226973499413544, "learning_rate": 1.8422601079483516e-07, "logits/chosen": -0.8443425297737122, "logits/rejected": -1.0523451566696167, "logps/chosen": -0.6304419636726379, "logps/rejected": -1.6065928936004639, "loss": 0.4965, "odds_ratio_loss": 0.25557130575180054, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03152209892868996, "rewards/margins": 0.048807550221681595, "rewards/rejected": -0.08032964915037155, "sft_loss": 0.6304419636726379, "step": 3335 }, { "epoch": 2.672, "grad_norm": 5.151887544991019, "learning_rate": 1.798672690923828e-07, "logits/chosen": -0.4008614122867584, "logits/rejected": -1.028642177581787, "logps/chosen": -0.4695549011230469, "logps/rejected": -1.3987529277801514, "loss": 0.4852, "odds_ratio_loss": 0.2155243456363678, "rewards/accuracies": 1.0, "rewards/chosen": -0.023477744311094284, "rewards/margins": 0.046459902077913284, "rewards/rejected": -0.06993765383958817, "sft_loss": 0.4695549011230469, "step": 3340 }, { "epoch": 2.676, "grad_norm": 4.960271872554221, "learning_rate": 1.7555878527937164e-07, "logits/chosen": -1.132819414138794, "logits/rejected": -1.2651735544204712, "logps/chosen": -0.2899089753627777, "logps/rejected": -1.4967478513717651, "loss": 0.4768, "odds_ratio_loss": 0.12462642043828964, "rewards/accuracies": 1.0, "rewards/chosen": -0.014495449140667915, "rewards/margins": 0.06034195423126221, "rewards/rejected": -0.07483740150928497, "sft_loss": 0.2899089753627777, "step": 3345 }, { "epoch": 2.68, "grad_norm": 5.252700869191946, "learning_rate": 1.713006526846439e-07, "logits/chosen": -1.0679802894592285, "logits/rejected": -1.2805407047271729, "logps/chosen": -0.6222777366638184, "logps/rejected": -1.4826539754867554, "loss": 0.4673, "odds_ratio_loss": 0.23923306167125702, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.031113887205719948, "rewards/margins": 0.04301881417632103, "rewards/rejected": -0.07413269579410553, "sft_loss": 0.6222777366638184, "step": 3350 }, { "epoch": 2.684, "grad_norm": 6.088871700564241, "learning_rate": 1.6709296354635335e-07, "logits/chosen": -0.7418292760848999, "logits/rejected": -1.1808269023895264, "logps/chosen": -0.7619951963424683, "logps/rejected": -1.662157654762268, "loss": 0.514, "odds_ratio_loss": 0.3287936747074127, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.038099758327007294, "rewards/margins": 0.04500812292098999, "rewards/rejected": -0.08310787379741669, "sft_loss": 0.7619951963424683, "step": 3355 }, { "epoch": 2.6879999999999997, "grad_norm": 4.838274349890116, "learning_rate": 1.629358090099639e-07, "logits/chosen": -1.0373485088348389, "logits/rejected": -1.0439493656158447, "logps/chosen": -0.42322856187820435, "logps/rejected": -1.5781844854354858, "loss": 0.4928, "odds_ratio_loss": 0.1596982479095459, "rewards/accuracies": 1.0, "rewards/chosen": -0.021161429584026337, "rewards/margins": 0.05774780362844467, "rewards/rejected": -0.07890923321247101, "sft_loss": 0.42322856187820435, "step": 3360 }, { "epoch": 2.692, "grad_norm": 4.434071150396496, "learning_rate": 1.5882927912627772e-07, "logits/chosen": -0.8480218648910522, "logits/rejected": -0.9117224812507629, "logps/chosen": -0.4945312440395355, "logps/rejected": -1.384734034538269, "loss": 0.4637, "odds_ratio_loss": 0.23109391331672668, "rewards/accuracies": 1.0, "rewards/chosen": -0.024726565927267075, "rewards/margins": 0.0445101372897625, "rewards/rejected": -0.06923670321702957, "sft_loss": 0.4945312440395355, "step": 3365 }, { "epoch": 2.6959999999999997, "grad_norm": 5.347217671653992, "learning_rate": 1.5477346284948292e-07, "logits/chosen": -1.1148836612701416, "logits/rejected": -1.0766358375549316, "logps/chosen": -0.2054683268070221, "logps/rejected": -1.9612598419189453, "loss": 0.352, "odds_ratio_loss": 0.04080657660961151, "rewards/accuracies": 1.0, "rewards/chosen": -0.010273417457938194, "rewards/margins": 0.08778958767652512, "rewards/rejected": -0.09806300699710846, "sft_loss": 0.2054683268070221, "step": 3370 }, { "epoch": 2.7, "grad_norm": 7.112953927848122, "learning_rate": 1.507684480352292e-07, "logits/chosen": -0.86543208360672, "logits/rejected": -1.3143550157546997, "logps/chosen": -0.7909175157546997, "logps/rejected": -1.7691437005996704, "loss": 0.4695, "odds_ratio_loss": 0.23235614597797394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03954587131738663, "rewards/margins": 0.04891129583120346, "rewards/rejected": -0.08845716714859009, "sft_loss": 0.7909175157546997, "step": 3375 }, { "epoch": 2.7039999999999997, "grad_norm": 7.9988782226086155, "learning_rate": 1.4681432143872133e-07, "logits/chosen": -0.6214197874069214, "logits/rejected": -1.0236066579818726, "logps/chosen": -0.5888375639915466, "logps/rejected": -1.7203214168548584, "loss": 0.4666, "odds_ratio_loss": 0.1753186732530594, "rewards/accuracies": 1.0, "rewards/chosen": -0.02944187819957733, "rewards/margins": 0.056574203073978424, "rewards/rejected": -0.08601607382297516, "sft_loss": 0.5888375639915466, "step": 3380 }, { "epoch": 2.708, "grad_norm": 3.7336969896252983, "learning_rate": 1.4291116871284205e-07, "logits/chosen": -1.00608229637146, "logits/rejected": -1.003975510597229, "logps/chosen": -0.43859434127807617, "logps/rejected": -1.8750841617584229, "loss": 0.4176, "odds_ratio_loss": 0.21985983848571777, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02192971669137478, "rewards/margins": 0.07182449102401733, "rewards/rejected": -0.09375420212745667, "sft_loss": 0.43859434127807617, "step": 3385 }, { "epoch": 2.7119999999999997, "grad_norm": 9.218065040509618, "learning_rate": 1.3905907440629752e-07, "logits/chosen": -0.9918481111526489, "logits/rejected": -1.3855509757995605, "logps/chosen": -0.3834057152271271, "logps/rejected": -1.7054436206817627, "loss": 0.4777, "odds_ratio_loss": 0.16741231083869934, "rewards/accuracies": 1.0, "rewards/chosen": -0.019170286133885384, "rewards/margins": 0.06610190868377686, "rewards/rejected": -0.08527218550443649, "sft_loss": 0.3834057152271271, "step": 3390 }, { "epoch": 2.716, "grad_norm": 4.274440200102076, "learning_rate": 1.352581219617824e-07, "logits/chosen": -0.6903150677680969, "logits/rejected": -1.1200445890426636, "logps/chosen": -0.3996601402759552, "logps/rejected": -1.8368583917617798, "loss": 0.4716, "odds_ratio_loss": 0.13882999122142792, "rewards/accuracies": 1.0, "rewards/chosen": -0.01998300477862358, "rewards/margins": 0.07185991108417511, "rewards/rejected": -0.09184291958808899, "sft_loss": 0.3996601402759552, "step": 3395 }, { "epoch": 2.7199999999999998, "grad_norm": 4.630834348255805, "learning_rate": 1.31508393714177e-07, "logits/chosen": -1.1776244640350342, "logits/rejected": -0.7361685037612915, "logps/chosen": -0.2189209908246994, "logps/rejected": -1.2366139888763428, "loss": 0.3637, "odds_ratio_loss": 0.09817551076412201, "rewards/accuracies": 1.0, "rewards/chosen": -0.010946051217615604, "rewards/margins": 0.050884656608104706, "rewards/rejected": -0.061830706894397736, "sft_loss": 0.2189209908246994, "step": 3400 }, { "epoch": 2.724, "grad_norm": 5.857849838216993, "learning_rate": 1.278099708887587e-07, "logits/chosen": -0.8234980702400208, "logits/rejected": -0.843535304069519, "logps/chosen": -0.36947911977767944, "logps/rejected": -1.4612740278244019, "loss": 0.475, "odds_ratio_loss": 0.1974354237318039, "rewards/accuracies": 1.0, "rewards/chosen": -0.018473956733942032, "rewards/margins": 0.054589755833148956, "rewards/rejected": -0.07306370884180069, "sft_loss": 0.36947911977767944, "step": 3405 }, { "epoch": 2.7279999999999998, "grad_norm": 6.068229298253824, "learning_rate": 1.241629335994471e-07, "logits/chosen": -0.7435353994369507, "logits/rejected": -1.0612701177597046, "logps/chosen": -0.5406766533851624, "logps/rejected": -1.4909696578979492, "loss": 0.4116, "odds_ratio_loss": 0.30309200286865234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.027033831924200058, "rewards/margins": 0.04751463979482651, "rewards/rejected": -0.07454848289489746, "sft_loss": 0.5406766533851624, "step": 3410 }, { "epoch": 2.732, "grad_norm": 8.977086173159117, "learning_rate": 1.2056736084706588e-07, "logits/chosen": -0.9622576832771301, "logits/rejected": -1.2189757823944092, "logps/chosen": -0.43772053718566895, "logps/rejected": -1.773707628250122, "loss": 0.4551, "odds_ratio_loss": 0.18512067198753357, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.021886030212044716, "rewards/margins": 0.0667993575334549, "rewards/rejected": -0.08868537843227386, "sft_loss": 0.43772053718566895, "step": 3415 }, { "epoch": 2.7359999999999998, "grad_norm": 5.513247634423349, "learning_rate": 1.1702333051763271e-07, "logits/chosen": -0.6844684481620789, "logits/rejected": -1.1717528104782104, "logps/chosen": -0.5392543077468872, "logps/rejected": -1.2498254776000977, "loss": 0.4714, "odds_ratio_loss": 0.30782368779182434, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02696271799504757, "rewards/margins": 0.03552855923771858, "rewards/rejected": -0.062491275370121, "sft_loss": 0.5392543077468872, "step": 3420 }, { "epoch": 2.74, "grad_norm": 3.9149151857473714, "learning_rate": 1.1353091938067024e-07, "logits/chosen": -0.6818164587020874, "logits/rejected": -0.7006502151489258, "logps/chosen": -0.4631117880344391, "logps/rejected": -1.6541814804077148, "loss": 0.434, "odds_ratio_loss": 0.186067134141922, "rewards/accuracies": 1.0, "rewards/chosen": -0.023155588656663895, "rewards/margins": 0.059553492814302444, "rewards/rejected": -0.08270907402038574, "sft_loss": 0.4631117880344391, "step": 3425 }, { "epoch": 2.7439999999999998, "grad_norm": 5.1831375438532135, "learning_rate": 1.1009020308754587e-07, "logits/chosen": -0.7273832559585571, "logits/rejected": -0.6493693590164185, "logps/chosen": -0.17281028628349304, "logps/rejected": -1.4289897680282593, "loss": 0.4975, "odds_ratio_loss": 0.1179138645529747, "rewards/accuracies": 1.0, "rewards/chosen": -0.008640513755381107, "rewards/margins": 0.06280897557735443, "rewards/rejected": -0.07144948095083237, "sft_loss": 0.17281028628349304, "step": 3430 }, { "epoch": 2.748, "grad_norm": 4.973491478841313, "learning_rate": 1.067012561698319e-07, "logits/chosen": -1.207350730895996, "logits/rejected": -0.8244549632072449, "logps/chosen": -0.5802727937698364, "logps/rejected": -1.568420648574829, "loss": 0.4946, "odds_ratio_loss": 0.22998332977294922, "rewards/accuracies": 1.0, "rewards/chosen": -0.029013637453317642, "rewards/margins": 0.04940740019083023, "rewards/rejected": -0.07842103391885757, "sft_loss": 0.5802727937698364, "step": 3435 }, { "epoch": 2.752, "grad_norm": 7.538216183918059, "learning_rate": 1.0336415203768962e-07, "logits/chosen": -0.7276453971862793, "logits/rejected": -1.1800649166107178, "logps/chosen": -0.268561452627182, "logps/rejected": -1.8346469402313232, "loss": 0.4621, "odds_ratio_loss": 0.10011246055364609, "rewards/accuracies": 1.0, "rewards/chosen": -0.013428074307739735, "rewards/margins": 0.07830427587032318, "rewards/rejected": -0.09173235297203064, "sft_loss": 0.268561452627182, "step": 3440 }, { "epoch": 2.7560000000000002, "grad_norm": 4.385995959035907, "learning_rate": 1.0007896297828113e-07, "logits/chosen": -0.9866234660148621, "logits/rejected": -0.8177453875541687, "logps/chosen": -0.2839185297489166, "logps/rejected": -1.2978515625, "loss": 0.4541, "odds_ratio_loss": 0.13113179802894592, "rewards/accuracies": 1.0, "rewards/chosen": -0.014195924624800682, "rewards/margins": 0.05069665238261223, "rewards/rejected": -0.06489257514476776, "sft_loss": 0.2839185297489166, "step": 3445 }, { "epoch": 2.76, "grad_norm": 5.040991958957209, "learning_rate": 9.684576015420277e-08, "logits/chosen": -1.0320391654968262, "logits/rejected": -0.9190491437911987, "logps/chosen": -0.49723321199417114, "logps/rejected": -2.3937244415283203, "loss": 0.4407, "odds_ratio_loss": 0.21446876227855682, "rewards/accuracies": 1.0, "rewards/chosen": -0.024861661717295647, "rewards/margins": 0.09482455253601074, "rewards/rejected": -0.11968620866537094, "sft_loss": 0.49723321199417114, "step": 3450 }, { "epoch": 2.7640000000000002, "grad_norm": 6.834196455601754, "learning_rate": 9.36646136019434e-08, "logits/chosen": -1.0055029392242432, "logits/rejected": -0.663110077381134, "logps/chosen": -0.2397165298461914, "logps/rejected": -1.2084038257598877, "loss": 0.4345, "odds_ratio_loss": 0.11389895528554916, "rewards/accuracies": 1.0, "rewards/chosen": -0.011985826306045055, "rewards/margins": 0.04843435436487198, "rewards/rejected": -0.06042018532752991, "sft_loss": 0.2397165298461914, "step": 3455 }, { "epoch": 2.768, "grad_norm": 6.622662942386174, "learning_rate": 9.053559223036746e-08, "logits/chosen": -0.5032299757003784, "logits/rejected": -0.8415955305099487, "logps/chosen": -0.7557247877120972, "logps/rejected": -1.9935903549194336, "loss": 0.4693, "odds_ratio_loss": 0.29048803448677063, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03778623789548874, "rewards/margins": 0.0618932731449604, "rewards/rejected": -0.09967950731515884, "sft_loss": 0.7557247877120972, "step": 3460 }, { "epoch": 2.7720000000000002, "grad_norm": 5.779744351491692, "learning_rate": 8.745876381922147e-08, "logits/chosen": -0.9570730924606323, "logits/rejected": -1.0343133211135864, "logps/chosen": -0.3262855112552643, "logps/rejected": -1.9049714803695679, "loss": 0.4068, "odds_ratio_loss": 0.1138528436422348, "rewards/accuracies": 1.0, "rewards/chosen": -0.016314277425408363, "rewards/margins": 0.07893429696559906, "rewards/rejected": -0.09524857252836227, "sft_loss": 0.3262855112552643, "step": 3465 }, { "epoch": 2.776, "grad_norm": 6.075626717992663, "learning_rate": 8.44341950176683e-08, "logits/chosen": -0.8139813542366028, "logits/rejected": -1.261385202407837, "logps/chosen": -0.5494598150253296, "logps/rejected": -1.436342477798462, "loss": 0.474, "odds_ratio_loss": 0.2713177800178528, "rewards/accuracies": 1.0, "rewards/chosen": -0.02747299149632454, "rewards/margins": 0.044344138354063034, "rewards/rejected": -0.07181712985038757, "sft_loss": 0.5494598150253296, "step": 3470 }, { "epoch": 2.7800000000000002, "grad_norm": 6.027330370816976, "learning_rate": 8.146195134284052e-08, "logits/chosen": -1.251039981842041, "logits/rejected": -1.2202528715133667, "logps/chosen": -0.4107237458229065, "logps/rejected": -1.591752052307129, "loss": 0.4984, "odds_ratio_loss": 0.1679602563381195, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.020536188036203384, "rewards/margins": 0.05905142426490784, "rewards/rejected": -0.07958760112524033, "sft_loss": 0.4107237458229065, "step": 3475 }, { "epoch": 2.784, "grad_norm": 5.9048587954628315, "learning_rate": 7.854209717842231e-08, "logits/chosen": -0.8869150280952454, "logits/rejected": -1.0556492805480957, "logps/chosen": -0.4128708243370056, "logps/rejected": -1.244500756263733, "loss": 0.4525, "odds_ratio_loss": 0.25618261098861694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02064354531466961, "rewards/margins": 0.041581496596336365, "rewards/rejected": -0.06222504377365112, "sft_loss": 0.4128708243370056, "step": 3480 }, { "epoch": 2.7880000000000003, "grad_norm": 5.609907193598404, "learning_rate": 7.567469577325598e-08, "logits/chosen": -1.0691337585449219, "logits/rejected": -1.0011084079742432, "logps/chosen": -0.2602657973766327, "logps/rejected": -1.5600097179412842, "loss": 0.4698, "odds_ratio_loss": 0.10053672641515732, "rewards/accuracies": 1.0, "rewards/chosen": -0.013013291172683239, "rewards/margins": 0.0649871900677681, "rewards/rejected": -0.07800048589706421, "sft_loss": 0.2602657973766327, "step": 3485 }, { "epoch": 2.792, "grad_norm": 6.288586916197363, "learning_rate": 7.285980923996989e-08, "logits/chosen": -0.814477801322937, "logits/rejected": -1.0002424716949463, "logps/chosen": -0.39793896675109863, "logps/rejected": -1.5965421199798584, "loss": 0.4963, "odds_ratio_loss": 0.15374401211738586, "rewards/accuracies": 1.0, "rewards/chosen": -0.01989694871008396, "rewards/margins": 0.05993015691637993, "rewards/rejected": -0.07982710003852844, "sft_loss": 0.39793896675109863, "step": 3490 }, { "epoch": 2.7960000000000003, "grad_norm": 7.196669432259674, "learning_rate": 7.009749855363457e-08, "logits/chosen": -0.8080123066902161, "logits/rejected": -0.828715980052948, "logps/chosen": -0.4427156448364258, "logps/rejected": -1.4885919094085693, "loss": 0.467, "odds_ratio_loss": 0.21041274070739746, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02213578298687935, "rewards/margins": 0.0522938147187233, "rewards/rejected": -0.07442959398031235, "sft_loss": 0.4427156448364258, "step": 3495 }, { "epoch": 2.8, "grad_norm": 6.610897880824554, "learning_rate": 6.738782355044048e-08, "logits/chosen": -0.7845413088798523, "logits/rejected": -1.2522486448287964, "logps/chosen": -0.19464164972305298, "logps/rejected": -1.954445242881775, "loss": 0.5121, "odds_ratio_loss": 0.07869584858417511, "rewards/accuracies": 1.0, "rewards/chosen": -0.009732084348797798, "rewards/margins": 0.0879901796579361, "rewards/rejected": -0.09772225469350815, "sft_loss": 0.19464164972305298, "step": 3500 }, { "epoch": 2.8040000000000003, "grad_norm": 5.994838172411329, "learning_rate": 6.47308429264032e-08, "logits/chosen": -0.6243001818656921, "logits/rejected": -0.9205516576766968, "logps/chosen": -0.45419034361839294, "logps/rejected": -1.346792221069336, "loss": 0.4879, "odds_ratio_loss": 0.21579177677631378, "rewards/accuracies": 1.0, "rewards/chosen": -0.022709516808390617, "rewards/margins": 0.04463009163737297, "rewards/rejected": -0.06733961403369904, "sft_loss": 0.45419034361839294, "step": 3505 }, { "epoch": 2.808, "grad_norm": 5.18964559816438, "learning_rate": 6.212661423609184e-08, "logits/chosen": -0.7687469720840454, "logits/rejected": -0.9978445172309875, "logps/chosen": -0.38948315382003784, "logps/rejected": -1.690354585647583, "loss": 0.4909, "odds_ratio_loss": 0.1466546207666397, "rewards/accuracies": 1.0, "rewards/chosen": -0.01947415992617607, "rewards/margins": 0.06504356116056442, "rewards/rejected": -0.08451772481203079, "sft_loss": 0.38948315382003784, "step": 3510 }, { "epoch": 2.8120000000000003, "grad_norm": 4.278585362133975, "learning_rate": 5.957519389138106e-08, "logits/chosen": -0.8727335929870605, "logits/rejected": -0.9041558504104614, "logps/chosen": -0.5910320281982422, "logps/rejected": -1.4024748802185059, "loss": 0.468, "odds_ratio_loss": 0.2644711434841156, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02955160103738308, "rewards/margins": 0.0405721440911293, "rewards/rejected": -0.07012374699115753, "sft_loss": 0.5910320281982422, "step": 3515 }, { "epoch": 2.816, "grad_norm": 8.621278806090325, "learning_rate": 5.707663716023021e-08, "logits/chosen": -0.7765305042266846, "logits/rejected": -1.2130156755447388, "logps/chosen": -0.4113030433654785, "logps/rejected": -1.911913275718689, "loss": 0.4462, "odds_ratio_loss": 0.16190369427204132, "rewards/accuracies": 1.0, "rewards/chosen": -0.020565154030919075, "rewards/margins": 0.07503052800893784, "rewards/rejected": -0.09559567272663116, "sft_loss": 0.4113030433654785, "step": 3520 }, { "epoch": 2.82, "grad_norm": 5.895614935922102, "learning_rate": 5.463099816548578e-08, "logits/chosen": -0.8122976422309875, "logits/rejected": -1.2246438264846802, "logps/chosen": -0.6956444978713989, "logps/rejected": -1.742977499961853, "loss": 0.4596, "odds_ratio_loss": 0.2761508822441101, "rewards/accuracies": 1.0, "rewards/chosen": -0.03478222340345383, "rewards/margins": 0.05236666277050972, "rewards/rejected": -0.08714888989925385, "sft_loss": 0.6956444978713989, "step": 3525 }, { "epoch": 2.824, "grad_norm": 5.257809415391807, "learning_rate": 5.22383298837098e-08, "logits/chosen": -1.0394823551177979, "logits/rejected": -1.370062232017517, "logps/chosen": -0.6062598824501038, "logps/rejected": -1.899864912033081, "loss": 0.5637, "odds_ratio_loss": 0.30006757378578186, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.030312994495034218, "rewards/margins": 0.06468025594949722, "rewards/rejected": -0.0949932411313057, "sft_loss": 0.6062598824501038, "step": 3530 }, { "epoch": 2.828, "grad_norm": 5.075614233899885, "learning_rate": 4.989868414403048e-08, "logits/chosen": -0.7696380615234375, "logits/rejected": -0.9930199384689331, "logps/chosen": -0.5346277952194214, "logps/rejected": -2.1930575370788574, "loss": 0.5216, "odds_ratio_loss": 0.2663384974002838, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02673139050602913, "rewards/margins": 0.08292149752378464, "rewards/rejected": -0.10965289175510406, "sft_loss": 0.5346277952194214, "step": 3535 }, { "epoch": 2.832, "grad_norm": 4.417337663334965, "learning_rate": 4.761211162702117e-08, "logits/chosen": -1.0091744661331177, "logits/rejected": -0.9187496900558472, "logps/chosen": -0.3657459616661072, "logps/rejected": -1.539928674697876, "loss": 0.5018, "odds_ratio_loss": 0.1328873336315155, "rewards/accuracies": 1.0, "rewards/chosen": -0.018287301063537598, "rewards/margins": 0.05870913341641426, "rewards/rejected": -0.07699643075466156, "sft_loss": 0.3657459616661072, "step": 3540 }, { "epoch": 2.836, "grad_norm": 4.546971390558208, "learning_rate": 4.537866186360207e-08, "logits/chosen": -0.7591463327407837, "logits/rejected": -0.7273364067077637, "logps/chosen": -0.42804569005966187, "logps/rejected": -1.2514833211898804, "loss": 0.4635, "odds_ratio_loss": 0.2270592749118805, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02140228822827339, "rewards/margins": 0.04117188602685928, "rewards/rejected": -0.06257417052984238, "sft_loss": 0.42804569005966187, "step": 3545 }, { "epoch": 2.84, "grad_norm": 8.001000670131893, "learning_rate": 4.319838323396691e-08, "logits/chosen": -1.2595059871673584, "logits/rejected": -0.9697495698928833, "logps/chosen": -0.4693234860897064, "logps/rejected": -1.4664093255996704, "loss": 0.5123, "odds_ratio_loss": 0.27086126804351807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.02346617542207241, "rewards/margins": 0.04985428601503372, "rewards/rejected": -0.07332046329975128, "sft_loss": 0.4693234860897064, "step": 3550 }, { "epoch": 2.844, "grad_norm": 15.587957013391982, "learning_rate": 4.1071322966535487e-08, "logits/chosen": -0.996113657951355, "logits/rejected": -0.6731228828430176, "logps/chosen": -0.2613915801048279, "logps/rejected": -1.2313883304595947, "loss": 0.432, "odds_ratio_loss": 0.12515100836753845, "rewards/accuracies": 1.0, "rewards/chosen": -0.013069577515125275, "rewards/margins": 0.04849984496831894, "rewards/rejected": -0.061569422483444214, "sft_loss": 0.2613915801048279, "step": 3555 }, { "epoch": 2.848, "grad_norm": 7.371630683017223, "learning_rate": 3.8997527136930004e-08, "logits/chosen": -0.9078682065010071, "logits/rejected": -0.7578974962234497, "logps/chosen": -0.3508548438549042, "logps/rejected": -1.5415148735046387, "loss": 0.4796, "odds_ratio_loss": 0.122395358979702, "rewards/accuracies": 1.0, "rewards/chosen": -0.01754274033010006, "rewards/margins": 0.059532999992370605, "rewards/rejected": -0.07707574218511581, "sft_loss": 0.3508548438549042, "step": 3560 }, { "epoch": 2.852, "grad_norm": 8.0726795268241, "learning_rate": 3.6977040666977546e-08, "logits/chosen": -1.1534029245376587, "logits/rejected": -1.2099335193634033, "logps/chosen": -0.5775087475776672, "logps/rejected": -1.39055597782135, "loss": 0.4869, "odds_ratio_loss": 0.2733613848686218, "rewards/accuracies": 1.0, "rewards/chosen": -0.028875436633825302, "rewards/margins": 0.04065236449241638, "rewards/rejected": -0.06952779740095139, "sft_loss": 0.5775087475776672, "step": 3565 }, { "epoch": 2.856, "grad_norm": 11.24865948956023, "learning_rate": 3.5009907323737826e-08, "logits/chosen": -0.552277684211731, "logits/rejected": -1.1681157350540161, "logps/chosen": -0.6316530108451843, "logps/rejected": -1.6660516262054443, "loss": 0.5061, "odds_ratio_loss": 0.2643664479255676, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03158264979720116, "rewards/margins": 0.05171992629766464, "rewards/rejected": -0.0833025798201561, "sft_loss": 0.6316530108451843, "step": 3570 }, { "epoch": 2.86, "grad_norm": 8.187980250866474, "learning_rate": 3.309616971855195e-08, "logits/chosen": -0.9434603452682495, "logits/rejected": -1.0135250091552734, "logps/chosen": -0.23478789627552032, "logps/rejected": -2.0454790592193604, "loss": 0.4638, "odds_ratio_loss": 0.12218568474054337, "rewards/accuracies": 1.0, "rewards/chosen": -0.011739394627511501, "rewards/margins": 0.09053455293178558, "rewards/rejected": -0.10227394104003906, "sft_loss": 0.23478789627552032, "step": 3575 }, { "epoch": 2.864, "grad_norm": 10.077811295456423, "learning_rate": 3.1235869306123766e-08, "logits/chosen": -1.0300368070602417, "logits/rejected": -0.8315317034721375, "logps/chosen": -0.5715829133987427, "logps/rejected": -1.4090276956558228, "loss": 0.4626, "odds_ratio_loss": 0.3393372893333435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.028579145669937134, "rewards/margins": 0.04187224060297012, "rewards/rejected": -0.07045139372348785, "sft_loss": 0.5715829133987427, "step": 3580 }, { "epoch": 2.868, "grad_norm": 6.609706143071862, "learning_rate": 2.9429046383618042e-08, "logits/chosen": -1.0845788717269897, "logits/rejected": -1.40879487991333, "logps/chosen": -0.467085063457489, "logps/rejected": -1.8588714599609375, "loss": 0.5266, "odds_ratio_loss": 0.16859348118305206, "rewards/accuracies": 1.0, "rewards/chosen": -0.02335425093770027, "rewards/margins": 0.06958932429552078, "rewards/rejected": -0.09294357895851135, "sft_loss": 0.467085063457489, "step": 3585 }, { "epoch": 2.872, "grad_norm": 4.974574218409692, "learning_rate": 2.767574008979007e-08, "logits/chosen": -0.8864003419876099, "logits/rejected": -1.0464935302734375, "logps/chosen": -0.5476582050323486, "logps/rejected": -1.9629188776016235, "loss": 0.4826, "odds_ratio_loss": 0.19431143999099731, "rewards/accuracies": 1.0, "rewards/chosen": -0.02738291397690773, "rewards/margins": 0.07076303660869598, "rewards/rejected": -0.09814594686031342, "sft_loss": 0.5476582050323486, "step": 3590 }, { "epoch": 2.876, "grad_norm": 5.391207506724364, "learning_rate": 2.59759884041369e-08, "logits/chosen": -0.8431358337402344, "logits/rejected": -0.9085556864738464, "logps/chosen": -0.26856502890586853, "logps/rejected": -1.4795863628387451, "loss": 0.442, "odds_ratio_loss": 0.1016441136598587, "rewards/accuracies": 1.0, "rewards/chosen": -0.01342825312167406, "rewards/margins": 0.060551077127456665, "rewards/rejected": -0.07397932559251785, "sft_loss": 0.26856502890586853, "step": 3595 }, { "epoch": 2.88, "grad_norm": 4.308262440272032, "learning_rate": 2.4329828146074096e-08, "logits/chosen": -0.7681323289871216, "logits/rejected": -0.8165963292121887, "logps/chosen": -0.3159729242324829, "logps/rejected": -1.6058391332626343, "loss": 0.4946, "odds_ratio_loss": 0.14674882590770721, "rewards/accuracies": 1.0, "rewards/chosen": -0.015798646956682205, "rewards/margins": 0.06449331343173981, "rewards/rejected": -0.08029197156429291, "sft_loss": 0.3159729242324829, "step": 3600 }, { "epoch": 2.884, "grad_norm": 4.854650051917074, "learning_rate": 2.2737294974140013e-08, "logits/chosen": -0.9245132207870483, "logits/rejected": -0.9439831972122192, "logps/chosen": -0.5168892741203308, "logps/rejected": -1.4722706079483032, "loss": 0.6122, "odds_ratio_loss": 0.26270395517349243, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.02584446594119072, "rewards/margins": 0.04776906222105026, "rewards/rejected": -0.07361352443695068, "sft_loss": 0.5168892741203308, "step": 3605 }, { "epoch": 2.888, "grad_norm": 4.810264838940049, "learning_rate": 2.1198423385220822e-08, "logits/chosen": -0.5693929195404053, "logits/rejected": -0.8137924075126648, "logps/chosen": -0.5562976598739624, "logps/rejected": -1.870003342628479, "loss": 0.4924, "odds_ratio_loss": 0.1856372058391571, "rewards/accuracies": 1.0, "rewards/chosen": -0.02781488373875618, "rewards/margins": 0.06568528711795807, "rewards/rejected": -0.09350016713142395, "sft_loss": 0.5562976598739624, "step": 3610 }, { "epoch": 2.892, "grad_norm": 4.590085289404893, "learning_rate": 1.9713246713805588e-08, "logits/chosen": -0.7041809558868408, "logits/rejected": -0.819964587688446, "logps/chosen": -0.5815272331237793, "logps/rejected": -1.5759763717651367, "loss": 0.4648, "odds_ratio_loss": 0.21994754672050476, "rewards/accuracies": 1.0, "rewards/chosen": -0.029076358303427696, "rewards/margins": 0.04972246289253235, "rewards/rejected": -0.0787988156080246, "sft_loss": 0.5815272331237793, "step": 3615 }, { "epoch": 2.896, "grad_norm": 5.1032023997222655, "learning_rate": 1.82817971312621e-08, "logits/chosen": -0.9272521138191223, "logits/rejected": -0.969031810760498, "logps/chosen": -0.3067367672920227, "logps/rejected": -1.5883128643035889, "loss": 0.466, "odds_ratio_loss": 0.13033534586429596, "rewards/accuracies": 1.0, "rewards/chosen": -0.01533683855086565, "rewards/margins": 0.06407880038022995, "rewards/rejected": -0.07941563427448273, "sft_loss": 0.3067367672920227, "step": 3620 }, { "epoch": 2.9, "grad_norm": 5.133104020591015, "learning_rate": 1.6904105645142443e-08, "logits/chosen": -0.6860765218734741, "logits/rejected": -1.004093885421753, "logps/chosen": -0.313940167427063, "logps/rejected": -1.984702706336975, "loss": 0.4767, "odds_ratio_loss": 0.05924931913614273, "rewards/accuracies": 1.0, "rewards/chosen": -0.01569700986146927, "rewards/margins": 0.08353812992572784, "rewards/rejected": -0.09923513978719711, "sft_loss": 0.313940167427063, "step": 3625 }, { "epoch": 2.904, "grad_norm": 6.289524397383519, "learning_rate": 1.5580202098509078e-08, "logits/chosen": -0.6269059181213379, "logits/rejected": -0.887751579284668, "logps/chosen": -0.6534531116485596, "logps/rejected": -1.3601219654083252, "loss": 0.5362, "odds_ratio_loss": 0.3122999668121338, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03267265111207962, "rewards/margins": 0.03533344715833664, "rewards/rejected": -0.06800609827041626, "sft_loss": 0.6534531116485596, "step": 3630 }, { "epoch": 2.908, "grad_norm": 5.321061246172241, "learning_rate": 1.4310115169289263e-08, "logits/chosen": -0.7741699814796448, "logits/rejected": -0.8883988261222839, "logps/chosen": -0.3918963074684143, "logps/rejected": -1.3303663730621338, "loss": 0.4475, "odds_ratio_loss": 0.18605419993400574, "rewards/accuracies": 1.0, "rewards/chosen": -0.019594816491007805, "rewards/margins": 0.04692351073026657, "rewards/rejected": -0.06651832163333893, "sft_loss": 0.3918963074684143, "step": 3635 }, { "epoch": 2.912, "grad_norm": 5.614615596322008, "learning_rate": 1.3093872369654148e-08, "logits/chosen": -0.813896656036377, "logits/rejected": -0.9681650400161743, "logps/chosen": -0.318438321352005, "logps/rejected": -1.9196224212646484, "loss": 0.5048, "odds_ratio_loss": 0.14587683975696564, "rewards/accuracies": 1.0, "rewards/chosen": -0.01592191681265831, "rewards/margins": 0.08005920052528381, "rewards/rejected": -0.09598111361265182, "sft_loss": 0.318438321352005, "step": 3640 }, { "epoch": 2.916, "grad_norm": 6.909146903510477, "learning_rate": 1.193150004542204e-08, "logits/chosen": -0.6660367250442505, "logits/rejected": -1.0153348445892334, "logps/chosen": -0.46080127358436584, "logps/rejected": -1.8787386417388916, "loss": 0.5142, "odds_ratio_loss": 0.22567503154277802, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.023040063679218292, "rewards/margins": 0.07089685648679733, "rewards/rejected": -0.09393692761659622, "sft_loss": 0.46080127358436584, "step": 3645 }, { "epoch": 2.92, "grad_norm": 7.790549829630172, "learning_rate": 1.0823023375489128e-08, "logits/chosen": -0.817104697227478, "logits/rejected": -0.8277327418327332, "logps/chosen": -0.6163122057914734, "logps/rejected": -1.7266231775283813, "loss": 0.4548, "odds_ratio_loss": 0.251958429813385, "rewards/accuracies": 1.0, "rewards/chosen": -0.03081561252474785, "rewards/margins": 0.05551555007696152, "rewards/rejected": -0.08633115142583847, "sft_loss": 0.6163122057914734, "step": 3650 }, { "epoch": 2.924, "grad_norm": 4.529447800983838, "learning_rate": 9.76846637128187e-09, "logits/chosen": -1.1566288471221924, "logits/rejected": -1.1448895931243896, "logps/chosen": -0.6995830535888672, "logps/rejected": -1.5597145557403564, "loss": 0.4972, "odds_ratio_loss": 0.30369362235069275, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03497915342450142, "rewards/margins": 0.043006572872400284, "rewards/rejected": -0.0779857262969017, "sft_loss": 0.6995830535888672, "step": 3655 }, { "epoch": 2.928, "grad_norm": 5.857785314987744, "learning_rate": 8.767851876239075e-09, "logits/chosen": -0.5766550898551941, "logits/rejected": -0.9048866033554077, "logps/chosen": -0.43968862295150757, "logps/rejected": -1.336573839187622, "loss": 0.4655, "odds_ratio_loss": 0.20616094768047333, "rewards/accuracies": 1.0, "rewards/chosen": -0.021984433755278587, "rewards/margins": 0.04484425485134125, "rewards/rejected": -0.06682869046926498, "sft_loss": 0.43968862295150757, "step": 3660 }, { "epoch": 2.932, "grad_norm": 5.159988160646394, "learning_rate": 7.821201565316184e-09, "logits/chosen": -1.0310778617858887, "logits/rejected": -0.7640330791473389, "logps/chosen": -0.3315303921699524, "logps/rejected": -1.6290994882583618, "loss": 0.4037, "odds_ratio_loss": 0.10144094377756119, "rewards/accuracies": 1.0, "rewards/chosen": -0.01657651923596859, "rewards/margins": 0.064878448843956, "rewards/rejected": -0.08145496994256973, "sft_loss": 0.3315303921699524, "step": 3665 }, { "epoch": 2.936, "grad_norm": 18.981926903930404, "learning_rate": 6.9285359445145366e-09, "logits/chosen": -0.8311254382133484, "logits/rejected": -0.9923003911972046, "logps/chosen": -0.5326222777366638, "logps/rejected": -1.6575853824615479, "loss": 0.4691, "odds_ratio_loss": 0.2388612926006317, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02663111314177513, "rewards/margins": 0.056248150765895844, "rewards/rejected": -0.08287926018238068, "sft_loss": 0.5326222777366638, "step": 3670 }, { "epoch": 2.94, "grad_norm": 4.65539553385166, "learning_rate": 6.089874350439507e-09, "logits/chosen": -0.8828865885734558, "logits/rejected": -1.2277076244354248, "logps/chosen": -0.44972023367881775, "logps/rejected": -1.8230899572372437, "loss": 0.4531, "odds_ratio_loss": 0.0991957038640976, "rewards/accuracies": 1.0, "rewards/chosen": -0.022486012428998947, "rewards/margins": 0.06866848468780518, "rewards/rejected": -0.09115449339151382, "sft_loss": 0.44972023367881775, "step": 3675 }, { "epoch": 2.944, "grad_norm": 7.036020184151846, "learning_rate": 5.305234949880001e-09, "logits/chosen": -0.8388309478759766, "logits/rejected": -0.784132719039917, "logps/chosen": -0.5297240018844604, "logps/rejected": -1.3611842393875122, "loss": 0.4606, "odds_ratio_loss": 0.23987647891044617, "rewards/accuracies": 1.0, "rewards/chosen": -0.02648620307445526, "rewards/margins": 0.04157300293445587, "rewards/rejected": -0.06805920600891113, "sft_loss": 0.5297240018844604, "step": 3680 }, { "epoch": 2.948, "grad_norm": 8.642237728292242, "learning_rate": 4.57463473941544e-09, "logits/chosen": -1.135202407836914, "logits/rejected": -0.9565531611442566, "logps/chosen": -0.48526114225387573, "logps/rejected": -1.3105952739715576, "loss": 0.5389, "odds_ratio_loss": 0.2372247278690338, "rewards/accuracies": 1.0, "rewards/chosen": -0.024263057857751846, "rewards/margins": 0.04126670956611633, "rewards/rejected": -0.06552976369857788, "sft_loss": 0.48526114225387573, "step": 3685 }, { "epoch": 2.952, "grad_norm": 4.384021145091059, "learning_rate": 3.8980895450474455e-09, "logits/chosen": -0.7918862104415894, "logits/rejected": -1.0930888652801514, "logps/chosen": -0.47447633743286133, "logps/rejected": -1.5464060306549072, "loss": 0.4936, "odds_ratio_loss": 0.19870418310165405, "rewards/accuracies": 1.0, "rewards/chosen": -0.023723818361759186, "rewards/margins": 0.053596485406160355, "rewards/rejected": -0.07732030004262924, "sft_loss": 0.47447633743286133, "step": 3690 }, { "epoch": 2.956, "grad_norm": 5.606541344203649, "learning_rate": 3.275614021857609e-09, "logits/chosen": -1.0226024389266968, "logits/rejected": -1.014661192893982, "logps/chosen": -0.8057661056518555, "logps/rejected": -1.672181487083435, "loss": 0.5466, "odds_ratio_loss": 0.33504363894462585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.040288303047418594, "rewards/margins": 0.04332076758146286, "rewards/rejected": -0.08360908180475235, "sft_loss": 0.8057661056518555, "step": 3695 }, { "epoch": 2.96, "grad_norm": 6.6670138700200035, "learning_rate": 2.7072216536885855e-09, "logits/chosen": -0.6843828558921814, "logits/rejected": -0.8663504719734192, "logps/chosen": -0.4747343063354492, "logps/rejected": -1.3466534614562988, "loss": 0.3528, "odds_ratio_loss": 0.2356620728969574, "rewards/accuracies": 1.0, "rewards/chosen": -0.02373671717941761, "rewards/margins": 0.04359595477581024, "rewards/rejected": -0.0673326700925827, "sft_loss": 0.4747343063354492, "step": 3700 }, { "epoch": 2.964, "grad_norm": 7.141276758716209, "learning_rate": 2.192924752854042e-09, "logits/chosen": -0.7393054962158203, "logits/rejected": -1.508094072341919, "logps/chosen": -0.49731236696243286, "logps/rejected": -1.8641046285629272, "loss": 0.4864, "odds_ratio_loss": 0.12721852958202362, "rewards/accuracies": 1.0, "rewards/chosen": -0.02486562170088291, "rewards/margins": 0.06833961606025696, "rewards/rejected": -0.09320523589849472, "sft_loss": 0.49731236696243286, "step": 3705 }, { "epoch": 2.968, "grad_norm": 5.266798743387709, "learning_rate": 1.7327344598702667e-09, "logits/chosen": -1.0665310621261597, "logits/rejected": -0.858269989490509, "logps/chosen": -0.536304235458374, "logps/rejected": -1.4411725997924805, "loss": 0.4227, "odds_ratio_loss": 0.328524649143219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.026815209537744522, "rewards/margins": 0.04524341970682144, "rewards/rejected": -0.07205863296985626, "sft_loss": 0.536304235458374, "step": 3710 }, { "epoch": 2.972, "grad_norm": 6.750661652998483, "learning_rate": 1.3266607432155243e-09, "logits/chosen": -0.6614164113998413, "logits/rejected": -0.7348114848136902, "logps/chosen": -0.39280936121940613, "logps/rejected": -1.426816463470459, "loss": 0.3934, "odds_ratio_loss": 0.1887359321117401, "rewards/accuracies": 1.0, "rewards/chosen": -0.019640469923615456, "rewards/margins": 0.05170035362243652, "rewards/rejected": -0.07134082168340683, "sft_loss": 0.39280936121940613, "step": 3715 }, { "epoch": 2.976, "grad_norm": 6.207932687783427, "learning_rate": 9.747123991141193e-10, "logits/chosen": -0.7722820043563843, "logits/rejected": -0.7826868295669556, "logps/chosen": -0.45992159843444824, "logps/rejected": -1.4466217756271362, "loss": 0.5124, "odds_ratio_loss": 0.19496043026447296, "rewards/accuracies": 1.0, "rewards/chosen": -0.02299608290195465, "rewards/margins": 0.04933501034975052, "rewards/rejected": -0.07233108580112457, "sft_loss": 0.45992159843444824, "step": 3720 }, { "epoch": 2.98, "grad_norm": 5.007278723928615, "learning_rate": 6.768970513457151e-10, "logits/chosen": -0.8529748916625977, "logits/rejected": -1.0278675556182861, "logps/chosen": -0.3574032783508301, "logps/rejected": -1.736302137374878, "loss": 0.4984, "odds_ratio_loss": 0.11382756382226944, "rewards/accuracies": 1.0, "rewards/chosen": -0.017870163545012474, "rewards/margins": 0.06894494593143463, "rewards/rejected": -0.08681510388851166, "sft_loss": 0.3574032783508301, "step": 3725 }, { "epoch": 2.984, "grad_norm": 5.762199382824231, "learning_rate": 4.332211510807427e-10, "logits/chosen": -0.7780022025108337, "logits/rejected": -1.1421865224838257, "logps/chosen": -0.7099230885505676, "logps/rejected": -1.55299711227417, "loss": 0.546, "odds_ratio_loss": 0.3306809067726135, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03549615666270256, "rewards/margins": 0.042153697460889816, "rewards/rejected": -0.07764985412359238, "sft_loss": 0.7099230885505676, "step": 3730 }, { "epoch": 2.988, "grad_norm": 4.682934344503231, "learning_rate": 2.43689976739403e-10, "logits/chosen": -0.8084946870803833, "logits/rejected": -1.319000482559204, "logps/chosen": -0.4620262682437897, "logps/rejected": -1.2795708179473877, "loss": 0.4644, "odds_ratio_loss": 0.23649486899375916, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.023101316764950752, "rewards/margins": 0.04087722301483154, "rewards/rejected": -0.06397853791713715, "sft_loss": 0.4620262682437897, "step": 3735 }, { "epoch": 2.992, "grad_norm": 6.157538833749046, "learning_rate": 1.0830763387897902e-10, "logits/chosen": -0.8642560839653015, "logits/rejected": -1.1312994956970215, "logps/chosen": -0.3675539791584015, "logps/rejected": -1.7602100372314453, "loss": 0.5132, "odds_ratio_loss": 0.23475000262260437, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.018377700820565224, "rewards/margins": 0.06963280588388443, "rewards/rejected": -0.0880105048418045, "sft_loss": 0.3675539791584015, "step": 3740 }, { "epoch": 2.996, "grad_norm": 5.903707133419837, "learning_rate": 2.7077055103075233e-11, "logits/chosen": -0.7607343792915344, "logits/rejected": -1.1049892902374268, "logps/chosen": -0.606769323348999, "logps/rejected": -1.8843456506729126, "loss": 0.4721, "odds_ratio_loss": 0.20161142945289612, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03033846616744995, "rewards/margins": 0.06387881934642792, "rewards/rejected": -0.09421730041503906, "sft_loss": 0.606769323348999, "step": 3745 }, { "epoch": 3.0, "grad_norm": 4.814195311056904, "learning_rate": 0.0, "logits/chosen": -1.1040897369384766, "logits/rejected": -1.2379401922225952, "logps/chosen": -0.4253109097480774, "logps/rejected": -1.3982548713684082, "loss": 0.4811, "odds_ratio_loss": 0.22021734714508057, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02126554399728775, "rewards/margins": 0.0486472025513649, "rewards/rejected": -0.06991274654865265, "sft_loss": 0.4253109097480774, "step": 3750 }, { "epoch": 3.0, "step": 3750, "total_flos": 126401337753600.0, "train_loss": 0.8602390615145366, "train_runtime": 16444.8476, "train_samples_per_second": 3.649, "train_steps_per_second": 0.228 } ], "logging_steps": 5, "max_steps": 3750, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100.0, "total_flos": 126401337753600.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }