diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12780 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 3750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004, + "grad_norm": 18.786390881708563, + "learning_rate": 6.666666666666668e-08, + "logits/chosen": -0.5580782294273376, + "logits/rejected": -0.7519971132278442, + "logps/chosen": -1.739689588546753, + "logps/rejected": -2.5574848651885986, + "loss": 1.6179, + "odds_ratio_loss": 0.6882386207580566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08698447048664093, + "rewards/margins": 0.040889762341976166, + "rewards/rejected": -0.1278742551803589, + "sft_loss": 1.739689588546753, + "step": 5 + }, + { + "epoch": 0.008, + "grad_norm": 23.4203092044018, + "learning_rate": 1.3333333333333336e-07, + "logits/chosen": -0.33697596192359924, + "logits/rejected": -0.5306824445724487, + "logps/chosen": -1.5324140787124634, + "logps/rejected": -1.5423911809921265, + "loss": 1.6397, + "odds_ratio_loss": 0.6955539584159851, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07662070542573929, + "rewards/margins": 0.0004988554865121841, + "rewards/rejected": -0.07711955904960632, + "sft_loss": 1.5324140787124634, + "step": 10 + }, + { + "epoch": 0.012, + "grad_norm": 115.39311996590018, + "learning_rate": 2.0000000000000002e-07, + "logits/chosen": -0.46800583600997925, + "logits/rejected": -0.6147341728210449, + "logps/chosen": -1.223116397857666, + "logps/rejected": -1.5311689376831055, + "loss": 1.8705, + "odds_ratio_loss": 0.5750418901443481, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06115582585334778, + "rewards/margins": 0.015402625314891338, + "rewards/rejected": -0.07655844837427139, + "sft_loss": 1.223116397857666, + "step": 15 + }, + { + "epoch": 0.016, + "grad_norm": 60.195715309742106, + "learning_rate": 2.666666666666667e-07, + "logits/chosen": -0.699694812297821, + "logits/rejected": -0.5708433985710144, + "logps/chosen": -2.690779209136963, + "logps/rejected": -2.1934876441955566, + "loss": 1.7322, + "odds_ratio_loss": 1.3645074367523193, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1345389485359192, + "rewards/margins": -0.02486458420753479, + "rewards/rejected": -0.1096743792295456, + "sft_loss": 2.690779209136963, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 136.78792197913728, + "learning_rate": 3.3333333333333335e-07, + "logits/chosen": -0.4181106686592102, + "logits/rejected": -0.6348497867584229, + "logps/chosen": -1.2124601602554321, + "logps/rejected": -1.4138513803482056, + "loss": 1.8234, + "odds_ratio_loss": 0.6432029008865356, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.060623008757829666, + "rewards/margins": 0.010069566778838634, + "rewards/rejected": -0.07069256901741028, + "sft_loss": 1.2124601602554321, + "step": 25 + }, + { + "epoch": 0.024, + "grad_norm": 82.77355067386294, + "learning_rate": 4.0000000000000003e-07, + "logits/chosen": -0.43863534927368164, + "logits/rejected": -0.43456798791885376, + "logps/chosen": -1.6070562601089478, + "logps/rejected": -1.8459875583648682, + "loss": 1.6574, + "odds_ratio_loss": 0.6512861251831055, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08035281300544739, + "rewards/margins": 0.011946573853492737, + "rewards/rejected": -0.09229937940835953, + "sft_loss": 1.6070562601089478, + "step": 30 + }, + { + "epoch": 0.028, + "grad_norm": 23.36137001588406, + "learning_rate": 4.666666666666667e-07, + "logits/chosen": -0.6327546238899231, + "logits/rejected": -0.5695880055427551, + "logps/chosen": -1.2451648712158203, + "logps/rejected": -1.3754527568817139, + "loss": 1.5305, + "odds_ratio_loss": 0.7118954658508301, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.062258243560791016, + "rewards/margins": 0.006514391396194696, + "rewards/rejected": -0.06877263635396957, + "sft_loss": 1.2451648712158203, + "step": 35 + }, + { + "epoch": 0.032, + "grad_norm": 36.18385215531129, + "learning_rate": 5.333333333333335e-07, + "logits/chosen": -0.5442869067192078, + "logits/rejected": -0.5034470558166504, + "logps/chosen": -1.5560173988342285, + "logps/rejected": -1.7619720697402954, + "loss": 1.4824, + "odds_ratio_loss": 0.7844841480255127, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07780086249113083, + "rewards/margins": 0.010297740809619427, + "rewards/rejected": -0.08809860795736313, + "sft_loss": 1.5560173988342285, + "step": 40 + }, + { + "epoch": 0.036, + "grad_norm": 13.148829953179805, + "learning_rate": 6.000000000000001e-07, + "logits/chosen": -0.6464945673942566, + "logits/rejected": -0.6559021472930908, + "logps/chosen": -1.3128869533538818, + "logps/rejected": -1.5577877759933472, + "loss": 1.4002, + "odds_ratio_loss": 0.7039626836776733, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06564434617757797, + "rewards/margins": 0.012245049700140953, + "rewards/rejected": -0.07788939774036407, + "sft_loss": 1.3128869533538818, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 31.644096608034133, + "learning_rate": 6.666666666666667e-07, + "logits/chosen": -0.6939803957939148, + "logits/rejected": -0.9392184019088745, + "logps/chosen": -1.2749038934707642, + "logps/rejected": -1.8819398880004883, + "loss": 1.4027, + "odds_ratio_loss": 0.5089942812919617, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06374519318342209, + "rewards/margins": 0.030351802706718445, + "rewards/rejected": -0.09409699589014053, + "sft_loss": 1.2749038934707642, + "step": 50 + }, + { + "epoch": 0.044, + "grad_norm": 62.42905881663458, + "learning_rate": 7.333333333333334e-07, + "logits/chosen": -0.8882333636283875, + "logits/rejected": -0.8815867304801941, + "logps/chosen": -1.6295671463012695, + "logps/rejected": -1.6508338451385498, + "loss": 1.53, + "odds_ratio_loss": 0.8277268409729004, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.08147837221622467, + "rewards/margins": 0.0010633214842528105, + "rewards/rejected": -0.08254168927669525, + "sft_loss": 1.6295671463012695, + "step": 55 + }, + { + "epoch": 0.048, + "grad_norm": 16.729835988532628, + "learning_rate": 8.000000000000001e-07, + "logits/chosen": -1.0203667879104614, + "logits/rejected": -0.8139181137084961, + "logps/chosen": -1.0937988758087158, + "logps/rejected": -1.4742703437805176, + "loss": 1.3953, + "odds_ratio_loss": 0.49027538299560547, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05468994379043579, + "rewards/margins": 0.019023580476641655, + "rewards/rejected": -0.073713518679142, + "sft_loss": 1.0937988758087158, + "step": 60 + }, + { + "epoch": 0.052, + "grad_norm": 13.431687226067643, + "learning_rate": 8.666666666666668e-07, + "logits/chosen": -0.6755761504173279, + "logits/rejected": -0.9210033416748047, + "logps/chosen": -1.2131409645080566, + "logps/rejected": -1.709071159362793, + "loss": 1.3022, + "odds_ratio_loss": 0.5299323201179504, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.060657043009996414, + "rewards/margins": 0.024796508252620697, + "rewards/rejected": -0.08545355498790741, + "sft_loss": 1.2131409645080566, + "step": 65 + }, + { + "epoch": 0.056, + "grad_norm": 14.183853012054012, + "learning_rate": 9.333333333333334e-07, + "logits/chosen": -0.6897495985031128, + "logits/rejected": -0.6041379570960999, + "logps/chosen": -1.1399095058441162, + "logps/rejected": -1.4036375284194946, + "loss": 1.2925, + "odds_ratio_loss": 0.5934557914733887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05699547380208969, + "rewards/margins": 0.01318640448153019, + "rewards/rejected": -0.07018186897039413, + "sft_loss": 1.1399095058441162, + "step": 70 + }, + { + "epoch": 0.06, + "grad_norm": 8.539096149562175, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -0.8109081387519836, + "logits/rejected": -0.8047749400138855, + "logps/chosen": -1.396490216255188, + "logps/rejected": -1.7933557033538818, + "loss": 1.327, + "odds_ratio_loss": 0.5322818756103516, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06982450932264328, + "rewards/margins": 0.019843269139528275, + "rewards/rejected": -0.08966778218746185, + "sft_loss": 1.396490216255188, + "step": 75 + }, + { + "epoch": 0.064, + "grad_norm": 16.572964152820063, + "learning_rate": 1.066666666666667e-06, + "logits/chosen": -0.6374383568763733, + "logits/rejected": -0.795744001865387, + "logps/chosen": -1.2772842645645142, + "logps/rejected": -1.3515651226043701, + "loss": 1.3445, + "odds_ratio_loss": 0.7885932922363281, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06386421620845795, + "rewards/margins": 0.0037140310741961002, + "rewards/rejected": -0.06757824867963791, + "sft_loss": 1.2772842645645142, + "step": 80 + }, + { + "epoch": 0.068, + "grad_norm": 11.477331395703159, + "learning_rate": 1.1333333333333334e-06, + "logits/chosen": -0.655463695526123, + "logits/rejected": -0.6941269636154175, + "logps/chosen": -1.2637232542037964, + "logps/rejected": -1.4608389139175415, + "loss": 1.3153, + "odds_ratio_loss": 0.628704845905304, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0631861612200737, + "rewards/margins": 0.009855778887867928, + "rewards/rejected": -0.07304193824529648, + "sft_loss": 1.2637232542037964, + "step": 85 + }, + { + "epoch": 0.072, + "grad_norm": 8.882050736167244, + "learning_rate": 1.2000000000000002e-06, + "logits/chosen": -0.7108498215675354, + "logits/rejected": -0.7867471575737, + "logps/chosen": -1.346906065940857, + "logps/rejected": -1.1983606815338135, + "loss": 1.233, + "odds_ratio_loss": 1.0082073211669922, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.06734530627727509, + "rewards/margins": -0.007427269127219915, + "rewards/rejected": -0.05991803854703903, + "sft_loss": 1.346906065940857, + "step": 90 + }, + { + "epoch": 0.076, + "grad_norm": 9.355470695768565, + "learning_rate": 1.2666666666666669e-06, + "logits/chosen": -0.5399163961410522, + "logits/rejected": -0.7329431772232056, + "logps/chosen": -1.2461333274841309, + "logps/rejected": -1.0014374256134033, + "loss": 1.2829, + "odds_ratio_loss": 0.9972267150878906, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.06230667233467102, + "rewards/margins": -0.012234793975949287, + "rewards/rejected": -0.050071872770786285, + "sft_loss": 1.2461333274841309, + "step": 95 + }, + { + "epoch": 0.08, + "grad_norm": 21.047192553416995, + "learning_rate": 1.3333333333333334e-06, + "logits/chosen": -0.4293244779109955, + "logits/rejected": -1.0099979639053345, + "logps/chosen": -1.2779626846313477, + "logps/rejected": -1.6887423992156982, + "loss": 1.2277, + "odds_ratio_loss": 0.48076191544532776, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.06389813870191574, + "rewards/margins": 0.020538978278636932, + "rewards/rejected": -0.08443711698055267, + "sft_loss": 1.2779626846313477, + "step": 100 + }, + { + "epoch": 0.084, + "grad_norm": 7.732739294319649, + "learning_rate": 1.4000000000000001e-06, + "logits/chosen": -0.48744726181030273, + "logits/rejected": -0.6798993349075317, + "logps/chosen": -1.0932471752166748, + "logps/rejected": -1.5973366498947144, + "loss": 1.1933, + "odds_ratio_loss": 0.6056200861930847, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05466235801577568, + "rewards/margins": 0.025204479694366455, + "rewards/rejected": -0.07986684143543243, + "sft_loss": 1.0932471752166748, + "step": 105 + }, + { + "epoch": 0.088, + "grad_norm": 17.46330045101831, + "learning_rate": 1.4666666666666669e-06, + "logits/chosen": -0.7039790153503418, + "logits/rejected": -0.4986042380332947, + "logps/chosen": -1.3014500141143799, + "logps/rejected": -1.3300515413284302, + "loss": 1.2646, + "odds_ratio_loss": 0.7541639804840088, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06507251411676407, + "rewards/margins": 0.0014300707262009382, + "rewards/rejected": -0.06650258600711823, + "sft_loss": 1.3014500141143799, + "step": 110 + }, + { + "epoch": 0.092, + "grad_norm": 9.681223889349653, + "learning_rate": 1.5333333333333334e-06, + "logits/chosen": -0.33655840158462524, + "logits/rejected": -0.45631903409957886, + "logps/chosen": -1.1225440502166748, + "logps/rejected": -1.4931358098983765, + "loss": 1.2771, + "odds_ratio_loss": 0.5696986317634583, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05612720176577568, + "rewards/margins": 0.01852959208190441, + "rewards/rejected": -0.07465679943561554, + "sft_loss": 1.1225440502166748, + "step": 115 + }, + { + "epoch": 0.096, + "grad_norm": 6.114934666493648, + "learning_rate": 1.6000000000000001e-06, + "logits/chosen": -0.23245234787464142, + "logits/rejected": -0.36162400245666504, + "logps/chosen": -0.8415991067886353, + "logps/rejected": -1.6995826959609985, + "loss": 1.3043, + "odds_ratio_loss": 0.4082559049129486, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04207995533943176, + "rewards/margins": 0.04289917275309563, + "rewards/rejected": -0.08497913181781769, + "sft_loss": 0.8415991067886353, + "step": 120 + }, + { + "epoch": 0.1, + "grad_norm": 29.566253249742736, + "learning_rate": 1.6666666666666667e-06, + "logits/chosen": -0.6754384636878967, + "logits/rejected": -0.5784372091293335, + "logps/chosen": -1.1139187812805176, + "logps/rejected": -1.0610531568527222, + "loss": 1.2067, + "odds_ratio_loss": 0.8141271471977234, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.05569593235850334, + "rewards/margins": -0.002643275773152709, + "rewards/rejected": -0.05305265635251999, + "sft_loss": 1.1139187812805176, + "step": 125 + }, + { + "epoch": 0.104, + "grad_norm": 18.345090279015356, + "learning_rate": 1.7333333333333336e-06, + "logits/chosen": -0.5198614597320557, + "logits/rejected": -0.5959798693656921, + "logps/chosen": -1.2163742780685425, + "logps/rejected": -1.0906116962432861, + "loss": 1.3261, + "odds_ratio_loss": 0.8466520309448242, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.060818713158369064, + "rewards/margins": -0.006288123317062855, + "rewards/rejected": -0.054530590772628784, + "sft_loss": 1.2163742780685425, + "step": 130 + }, + { + "epoch": 0.108, + "grad_norm": 14.549007799191262, + "learning_rate": 1.8000000000000001e-06, + "logits/chosen": -0.43266358971595764, + "logits/rejected": -0.566035270690918, + "logps/chosen": -1.3842393159866333, + "logps/rejected": -1.4916588068008423, + "loss": 1.3058, + "odds_ratio_loss": 0.6889979243278503, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06921195983886719, + "rewards/margins": 0.005370972212404013, + "rewards/rejected": -0.07458294183015823, + "sft_loss": 1.3842393159866333, + "step": 135 + }, + { + "epoch": 0.112, + "grad_norm": 23.355911973188775, + "learning_rate": 1.8666666666666669e-06, + "logits/chosen": -0.592883825302124, + "logits/rejected": -0.4132818281650543, + "logps/chosen": -1.100903868675232, + "logps/rejected": -1.2986562252044678, + "loss": 1.141, + "odds_ratio_loss": 0.5977322459220886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.055045194923877716, + "rewards/margins": 0.009887613356113434, + "rewards/rejected": -0.06493280827999115, + "sft_loss": 1.100903868675232, + "step": 140 + }, + { + "epoch": 0.116, + "grad_norm": 9.702121972571497, + "learning_rate": 1.9333333333333336e-06, + "logits/chosen": -0.6822474598884583, + "logits/rejected": -0.49178019165992737, + "logps/chosen": -1.1203429698944092, + "logps/rejected": -1.9260637760162354, + "loss": 1.2472, + "odds_ratio_loss": 0.4579285979270935, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05601715296506882, + "rewards/margins": 0.04028604179620743, + "rewards/rejected": -0.09630318731069565, + "sft_loss": 1.1203429698944092, + "step": 145 + }, + { + "epoch": 0.12, + "grad_norm": 16.918077923973826, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -0.5237475633621216, + "logits/rejected": -0.8934429287910461, + "logps/chosen": -0.8749428987503052, + "logps/rejected": -1.3101131916046143, + "loss": 1.1565, + "odds_ratio_loss": 0.5730590224266052, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04374714195728302, + "rewards/margins": 0.021758515387773514, + "rewards/rejected": -0.06550566107034683, + "sft_loss": 0.8749428987503052, + "step": 150 + }, + { + "epoch": 0.124, + "grad_norm": 11.087436159759044, + "learning_rate": 2.0666666666666666e-06, + "logits/chosen": -0.5964301824569702, + "logits/rejected": -0.8087562322616577, + "logps/chosen": -1.5669138431549072, + "logps/rejected": -1.6930387020111084, + "loss": 1.3477, + "odds_ratio_loss": 0.6572802066802979, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07834570109844208, + "rewards/margins": 0.006306241266429424, + "rewards/rejected": -0.08465193212032318, + "sft_loss": 1.5669138431549072, + "step": 155 + }, + { + "epoch": 0.128, + "grad_norm": 11.340859017147654, + "learning_rate": 2.133333333333334e-06, + "logits/chosen": -0.7819768190383911, + "logits/rejected": -0.4041160047054291, + "logps/chosen": -1.2440321445465088, + "logps/rejected": -1.2431968450546265, + "loss": 1.2147, + "odds_ratio_loss": 0.7579740285873413, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0622016116976738, + "rewards/margins": -4.1763483750401065e-05, + "rewards/rejected": -0.06215984374284744, + "sft_loss": 1.2440321445465088, + "step": 160 + }, + { + "epoch": 0.132, + "grad_norm": 11.290804145869599, + "learning_rate": 2.2e-06, + "logits/chosen": -0.3270031809806824, + "logits/rejected": -0.42402464151382446, + "logps/chosen": -1.2503063678741455, + "logps/rejected": -1.2082438468933105, + "loss": 1.3976, + "odds_ratio_loss": 0.7837561368942261, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06251531839370728, + "rewards/margins": -0.00210312707349658, + "rewards/rejected": -0.06041219085454941, + "sft_loss": 1.2503063678741455, + "step": 165 + }, + { + "epoch": 0.136, + "grad_norm": 8.653604673632358, + "learning_rate": 2.266666666666667e-06, + "logits/chosen": -0.4413372874259949, + "logits/rejected": -0.7267617583274841, + "logps/chosen": -1.1855711936950684, + "logps/rejected": -1.0826096534729004, + "loss": 1.2368, + "odds_ratio_loss": 0.8635139465332031, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.059278566390275955, + "rewards/margins": -0.005148076917976141, + "rewards/rejected": -0.05413048714399338, + "sft_loss": 1.1855711936950684, + "step": 170 + }, + { + "epoch": 0.14, + "grad_norm": 18.84252613733039, + "learning_rate": 2.3333333333333336e-06, + "logits/chosen": -0.31860145926475525, + "logits/rejected": -0.42350611090660095, + "logps/chosen": -1.1006211042404175, + "logps/rejected": -1.26455557346344, + "loss": 1.2382, + "odds_ratio_loss": 0.6225001811981201, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05503106117248535, + "rewards/margins": 0.00819671992212534, + "rewards/rejected": -0.06322778016328812, + "sft_loss": 1.1006211042404175, + "step": 175 + }, + { + "epoch": 0.144, + "grad_norm": 8.60596229262369, + "learning_rate": 2.4000000000000003e-06, + "logits/chosen": -0.6520022749900818, + "logits/rejected": -0.5037721991539001, + "logps/chosen": -1.0799922943115234, + "logps/rejected": -1.3683429956436157, + "loss": 1.2163, + "odds_ratio_loss": 0.6034747362136841, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05399961397051811, + "rewards/margins": 0.014417541213333607, + "rewards/rejected": -0.06841715425252914, + "sft_loss": 1.0799922943115234, + "step": 180 + }, + { + "epoch": 0.148, + "grad_norm": 8.352997349835983, + "learning_rate": 2.466666666666667e-06, + "logits/chosen": -0.33638280630111694, + "logits/rejected": -0.3540743589401245, + "logps/chosen": -1.5841834545135498, + "logps/rejected": -1.106182336807251, + "loss": 1.274, + "odds_ratio_loss": 1.2082093954086304, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.07920917868614197, + "rewards/margins": -0.02390005625784397, + "rewards/rejected": -0.05530911684036255, + "sft_loss": 1.5841834545135498, + "step": 185 + }, + { + "epoch": 0.152, + "grad_norm": 10.77421191597458, + "learning_rate": 2.5333333333333338e-06, + "logits/chosen": -0.35303565859794617, + "logits/rejected": -0.5206754803657532, + "logps/chosen": -1.4167323112487793, + "logps/rejected": -1.4747750759124756, + "loss": 1.2796, + "odds_ratio_loss": 0.7247415781021118, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0708366185426712, + "rewards/margins": 0.002902137814089656, + "rewards/rejected": -0.07373875379562378, + "sft_loss": 1.4167323112487793, + "step": 190 + }, + { + "epoch": 0.156, + "grad_norm": 7.1177401499786415, + "learning_rate": 2.6e-06, + "logits/chosen": -0.2510073482990265, + "logits/rejected": -0.5589274168014526, + "logps/chosen": -1.442596197128296, + "logps/rejected": -1.4756193161010742, + "loss": 1.3657, + "odds_ratio_loss": 0.7003253102302551, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07212980836629868, + "rewards/margins": 0.0016511573921889067, + "rewards/rejected": -0.07378096878528595, + "sft_loss": 1.442596197128296, + "step": 195 + }, + { + "epoch": 0.16, + "grad_norm": 16.763946753945877, + "learning_rate": 2.666666666666667e-06, + "logits/chosen": -0.45789265632629395, + "logits/rejected": -0.4571969509124756, + "logps/chosen": -1.0912725925445557, + "logps/rejected": -1.1674692630767822, + "loss": 1.3449, + "odds_ratio_loss": 0.7581279873847961, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05456363409757614, + "rewards/margins": 0.0038098313380032778, + "rewards/rejected": -0.05837346240878105, + "sft_loss": 1.0912725925445557, + "step": 200 + }, + { + "epoch": 0.164, + "grad_norm": 7.633189094243207, + "learning_rate": 2.7333333333333336e-06, + "logits/chosen": -0.5795624852180481, + "logits/rejected": -0.6412222981452942, + "logps/chosen": -1.0281997919082642, + "logps/rejected": -1.2789350748062134, + "loss": 1.2025, + "odds_ratio_loss": 0.6383514404296875, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05140998959541321, + "rewards/margins": 0.012536766938865185, + "rewards/rejected": -0.06394675374031067, + "sft_loss": 1.0281997919082642, + "step": 205 + }, + { + "epoch": 0.168, + "grad_norm": 22.399131275217997, + "learning_rate": 2.8000000000000003e-06, + "logits/chosen": -0.3538805842399597, + "logits/rejected": -0.6279109120368958, + "logps/chosen": -1.0936148166656494, + "logps/rejected": -1.2315231561660767, + "loss": 1.273, + "odds_ratio_loss": 0.6472857594490051, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05468074232339859, + "rewards/margins": 0.0068954164162278175, + "rewards/rejected": -0.06157616525888443, + "sft_loss": 1.0936148166656494, + "step": 210 + }, + { + "epoch": 0.172, + "grad_norm": 12.511784021141086, + "learning_rate": 2.866666666666667e-06, + "logits/chosen": -0.6461108326911926, + "logits/rejected": -0.8027406930923462, + "logps/chosen": -1.279821515083313, + "logps/rejected": -1.2249929904937744, + "loss": 1.1567, + "odds_ratio_loss": 0.7108494639396667, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.06399108469486237, + "rewards/margins": -0.002741418778896332, + "rewards/rejected": -0.06124965474009514, + "sft_loss": 1.279821515083313, + "step": 215 + }, + { + "epoch": 0.176, + "grad_norm": 18.111790971616198, + "learning_rate": 2.9333333333333338e-06, + "logits/chosen": -0.5222848057746887, + "logits/rejected": -0.5374074578285217, + "logps/chosen": -1.1454452276229858, + "logps/rejected": -1.541313648223877, + "loss": 1.3047, + "odds_ratio_loss": 0.46772265434265137, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.05727226287126541, + "rewards/margins": 0.01979340985417366, + "rewards/rejected": -0.07706567645072937, + "sft_loss": 1.1454452276229858, + "step": 220 + }, + { + "epoch": 0.18, + "grad_norm": 12.734975127781638, + "learning_rate": 3e-06, + "logits/chosen": -0.5977376699447632, + "logits/rejected": -0.2883208096027374, + "logps/chosen": -1.3275381326675415, + "logps/rejected": -1.18563711643219, + "loss": 1.2512, + "odds_ratio_loss": 0.9130814671516418, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06637690961360931, + "rewards/margins": -0.00709505146369338, + "rewards/rejected": -0.0592818558216095, + "sft_loss": 1.3275381326675415, + "step": 225 + }, + { + "epoch": 0.184, + "grad_norm": 21.003312916139876, + "learning_rate": 3.066666666666667e-06, + "logits/chosen": -0.5674949288368225, + "logits/rejected": -0.49964016675949097, + "logps/chosen": -1.375934362411499, + "logps/rejected": -1.5092494487762451, + "loss": 1.3693, + "odds_ratio_loss": 0.7147940993309021, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.06879671663045883, + "rewards/margins": 0.006665749941021204, + "rewards/rejected": -0.0754624754190445, + "sft_loss": 1.375934362411499, + "step": 230 + }, + { + "epoch": 0.188, + "grad_norm": 11.904735065181235, + "learning_rate": 3.133333333333334e-06, + "logits/chosen": -0.6791194677352905, + "logits/rejected": -0.7436596155166626, + "logps/chosen": -0.8784946203231812, + "logps/rejected": -1.1814043521881104, + "loss": 1.248, + "odds_ratio_loss": 0.5295432806015015, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.043924733996391296, + "rewards/margins": 0.015145489946007729, + "rewards/rejected": -0.059070222079753876, + "sft_loss": 0.8784946203231812, + "step": 235 + }, + { + "epoch": 0.192, + "grad_norm": 15.144263155627513, + "learning_rate": 3.2000000000000003e-06, + "logits/chosen": -0.39098218083381653, + "logits/rejected": -0.48908573389053345, + "logps/chosen": -1.4278444051742554, + "logps/rejected": -1.5091187953948975, + "loss": 1.3214, + "odds_ratio_loss": 0.6652258634567261, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07139221578836441, + "rewards/margins": 0.004063720349222422, + "rewards/rejected": -0.0754559338092804, + "sft_loss": 1.4278444051742554, + "step": 240 + }, + { + "epoch": 0.196, + "grad_norm": 7.701817646797151, + "learning_rate": 3.266666666666667e-06, + "logits/chosen": -0.39734160900115967, + "logits/rejected": -0.6708099246025085, + "logps/chosen": -1.1432991027832031, + "logps/rejected": -1.2521193027496338, + "loss": 1.2041, + "odds_ratio_loss": 0.7118695974349976, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.057164955884218216, + "rewards/margins": 0.005441013723611832, + "rewards/rejected": -0.06260596960783005, + "sft_loss": 1.1432991027832031, + "step": 245 + }, + { + "epoch": 0.2, + "grad_norm": 9.281984255417264, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -0.3979361057281494, + "logits/rejected": -0.5376076102256775, + "logps/chosen": -0.9498193860054016, + "logps/rejected": -1.696179986000061, + "loss": 1.3049, + "odds_ratio_loss": 0.5728334188461304, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04749097302556038, + "rewards/margins": 0.03731803596019745, + "rewards/rejected": -0.08480901271104813, + "sft_loss": 0.9498193860054016, + "step": 250 + }, + { + "epoch": 0.204, + "grad_norm": 9.425798997764154, + "learning_rate": 3.4000000000000005e-06, + "logits/chosen": -0.58461594581604, + "logits/rejected": -0.7774965763092041, + "logps/chosen": -1.144986867904663, + "logps/rejected": -1.2843310832977295, + "loss": 1.3384, + "odds_ratio_loss": 0.6974171996116638, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.057249344885349274, + "rewards/margins": 0.006967212073504925, + "rewards/rejected": -0.06421655416488647, + "sft_loss": 1.144986867904663, + "step": 255 + }, + { + "epoch": 0.208, + "grad_norm": 6.829873424161352, + "learning_rate": 3.4666666666666672e-06, + "logits/chosen": -0.4852909445762634, + "logits/rejected": -0.8059079051017761, + "logps/chosen": -1.2967959642410278, + "logps/rejected": -1.1163851022720337, + "loss": 1.2336, + "odds_ratio_loss": 0.985200047492981, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.06483979523181915, + "rewards/margins": -0.009020542725920677, + "rewards/rejected": -0.05581926181912422, + "sft_loss": 1.2967959642410278, + "step": 260 + }, + { + "epoch": 0.212, + "grad_norm": 12.53260839362661, + "learning_rate": 3.5333333333333335e-06, + "logits/chosen": -0.8439911603927612, + "logits/rejected": -0.4962303042411804, + "logps/chosen": -1.4761368036270142, + "logps/rejected": -1.4275569915771484, + "loss": 1.2936, + "odds_ratio_loss": 0.8170219659805298, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.07380684465169907, + "rewards/margins": -0.0024289849679917097, + "rewards/rejected": -0.07137785851955414, + "sft_loss": 1.4761368036270142, + "step": 265 + }, + { + "epoch": 0.216, + "grad_norm": 10.909364520509888, + "learning_rate": 3.6000000000000003e-06, + "logits/chosen": -0.7634373307228088, + "logits/rejected": -0.5086906552314758, + "logps/chosen": -1.543953537940979, + "logps/rejected": -1.3793667554855347, + "loss": 1.4597, + "odds_ratio_loss": 0.8944965600967407, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07719766348600388, + "rewards/margins": -0.008229334838688374, + "rewards/rejected": -0.06896833330392838, + "sft_loss": 1.543953537940979, + "step": 270 + }, + { + "epoch": 0.22, + "grad_norm": 14.036081295560992, + "learning_rate": 3.6666666666666666e-06, + "logits/chosen": -0.920630931854248, + "logits/rejected": -0.5563604831695557, + "logps/chosen": -1.1286569833755493, + "logps/rejected": -1.3310692310333252, + "loss": 1.3055, + "odds_ratio_loss": 0.621965765953064, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.056432850658893585, + "rewards/margins": 0.010120616294443607, + "rewards/rejected": -0.06655346602201462, + "sft_loss": 1.1286569833755493, + "step": 275 + }, + { + "epoch": 0.224, + "grad_norm": 7.314465975057194, + "learning_rate": 3.7333333333333337e-06, + "logits/chosen": -0.39631861448287964, + "logits/rejected": -0.6269375681877136, + "logps/chosen": -1.2827950716018677, + "logps/rejected": -1.5547349452972412, + "loss": 1.2209, + "odds_ratio_loss": 0.7333937883377075, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06413975358009338, + "rewards/margins": 0.013596994802355766, + "rewards/rejected": -0.0777367502450943, + "sft_loss": 1.2827950716018677, + "step": 280 + }, + { + "epoch": 0.228, + "grad_norm": 8.448913656983327, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": -0.465139240026474, + "logits/rejected": -0.5822895169258118, + "logps/chosen": -1.4464404582977295, + "logps/rejected": -1.5029900074005127, + "loss": 1.2867, + "odds_ratio_loss": 0.6958650350570679, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.07232202589511871, + "rewards/margins": 0.002827476244419813, + "rewards/rejected": -0.07514949887990952, + "sft_loss": 1.4464404582977295, + "step": 285 + }, + { + "epoch": 0.232, + "grad_norm": 5.845937122945277, + "learning_rate": 3.866666666666667e-06, + "logits/chosen": -0.43053460121154785, + "logits/rejected": -0.5996862649917603, + "logps/chosen": -1.004393458366394, + "logps/rejected": -1.2151196002960205, + "loss": 1.235, + "odds_ratio_loss": 0.6134498715400696, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05021967366337776, + "rewards/margins": 0.010536303743720055, + "rewards/rejected": -0.060755979269742966, + "sft_loss": 1.004393458366394, + "step": 290 + }, + { + "epoch": 0.236, + "grad_norm": 7.768195625026772, + "learning_rate": 3.9333333333333335e-06, + "logits/chosen": -0.3257526755332947, + "logits/rejected": -0.6037822365760803, + "logps/chosen": -2.014780282974243, + "logps/rejected": -1.3312056064605713, + "loss": 1.3802, + "odds_ratio_loss": 1.3934136629104614, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1007390022277832, + "rewards/margins": -0.034178726375103, + "rewards/rejected": -0.0665602907538414, + "sft_loss": 2.014780282974243, + "step": 295 + }, + { + "epoch": 0.24, + "grad_norm": 7.746680686909107, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -0.733782947063446, + "logits/rejected": -0.6740893125534058, + "logps/chosen": -1.2197449207305908, + "logps/rejected": -1.2922364473342896, + "loss": 1.2703, + "odds_ratio_loss": 0.665755569934845, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06098724529147148, + "rewards/margins": 0.003624574514105916, + "rewards/rejected": -0.06461182236671448, + "sft_loss": 1.2197449207305908, + "step": 300 + }, + { + "epoch": 0.244, + "grad_norm": 13.0956632135183, + "learning_rate": 4.066666666666667e-06, + "logits/chosen": -0.7939602732658386, + "logits/rejected": -0.3044114410877228, + "logps/chosen": -1.1555862426757812, + "logps/rejected": -1.36032235622406, + "loss": 1.321, + "odds_ratio_loss": 0.639162003993988, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05777931213378906, + "rewards/margins": 0.010236804373562336, + "rewards/rejected": -0.06801611930131912, + "sft_loss": 1.1555862426757812, + "step": 305 + }, + { + "epoch": 0.248, + "grad_norm": 11.678408623894509, + "learning_rate": 4.133333333333333e-06, + "logits/chosen": -0.7341225743293762, + "logits/rejected": -0.7172056436538696, + "logps/chosen": -1.210506796836853, + "logps/rejected": -1.2130941152572632, + "loss": 1.3283, + "odds_ratio_loss": 0.751409113407135, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06052535027265549, + "rewards/margins": 0.00012936182611156255, + "rewards/rejected": -0.06065471097826958, + "sft_loss": 1.210506796836853, + "step": 310 + }, + { + "epoch": 0.252, + "grad_norm": 10.485708060104384, + "learning_rate": 4.2000000000000004e-06, + "logits/chosen": -0.7123367190361023, + "logits/rejected": -0.5732913613319397, + "logps/chosen": -1.216386079788208, + "logps/rejected": -1.3553041219711304, + "loss": 1.2446, + "odds_ratio_loss": 0.649107038974762, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06081929802894592, + "rewards/margins": 0.006945909466594458, + "rewards/rejected": -0.06776521354913712, + "sft_loss": 1.216386079788208, + "step": 315 + }, + { + "epoch": 0.256, + "grad_norm": 5.890587593911549, + "learning_rate": 4.266666666666668e-06, + "logits/chosen": -0.5388275384902954, + "logits/rejected": -0.5508286356925964, + "logps/chosen": -1.1097781658172607, + "logps/rejected": -1.4719569683074951, + "loss": 1.1982, + "odds_ratio_loss": 0.5467421412467957, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05548890680074692, + "rewards/margins": 0.01810893975198269, + "rewards/rejected": -0.07359784096479416, + "sft_loss": 1.1097781658172607, + "step": 320 + }, + { + "epoch": 0.26, + "grad_norm": 16.06285773185922, + "learning_rate": 4.333333333333334e-06, + "logits/chosen": -0.7908880114555359, + "logits/rejected": -0.20292505621910095, + "logps/chosen": -2.2549867630004883, + "logps/rejected": -1.2224481105804443, + "loss": 1.5111, + "odds_ratio_loss": 1.8231436014175415, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11274933815002441, + "rewards/margins": -0.05162693187594414, + "rewards/rejected": -0.061122406274080276, + "sft_loss": 2.2549867630004883, + "step": 325 + }, + { + "epoch": 0.264, + "grad_norm": 8.454262342240037, + "learning_rate": 4.4e-06, + "logits/chosen": -0.4531725347042084, + "logits/rejected": -0.431458055973053, + "logps/chosen": -1.2020156383514404, + "logps/rejected": -1.3703901767730713, + "loss": 1.23, + "odds_ratio_loss": 0.5940185785293579, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06010078266263008, + "rewards/margins": 0.008418736979365349, + "rewards/rejected": -0.06851951032876968, + "sft_loss": 1.2020156383514404, + "step": 330 + }, + { + "epoch": 0.268, + "grad_norm": 9.90145361257035, + "learning_rate": 4.4666666666666665e-06, + "logits/chosen": -0.5263561010360718, + "logits/rejected": -0.6940143704414368, + "logps/chosen": -0.9874106645584106, + "logps/rejected": -1.1406878232955933, + "loss": 1.2596, + "odds_ratio_loss": 0.8020124435424805, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04937053471803665, + "rewards/margins": 0.007663858123123646, + "rewards/rejected": -0.05703439190983772, + "sft_loss": 0.9874106645584106, + "step": 335 + }, + { + "epoch": 0.272, + "grad_norm": 7.2942480975024475, + "learning_rate": 4.533333333333334e-06, + "logits/chosen": -0.4240873456001282, + "logits/rejected": -0.6028670072555542, + "logps/chosen": -1.3075716495513916, + "logps/rejected": -2.017928123474121, + "loss": 1.2217, + "odds_ratio_loss": 0.5069721341133118, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0653785839676857, + "rewards/margins": 0.035517822951078415, + "rewards/rejected": -0.10089640319347382, + "sft_loss": 1.3075716495513916, + "step": 340 + }, + { + "epoch": 0.276, + "grad_norm": 8.866201564442973, + "learning_rate": 4.600000000000001e-06, + "logits/chosen": -0.7544479370117188, + "logits/rejected": -0.21181067824363708, + "logps/chosen": -1.218145489692688, + "logps/rejected": -1.204552412033081, + "loss": 1.1941, + "odds_ratio_loss": 0.8569790720939636, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0609072744846344, + "rewards/margins": -0.0006796553498134017, + "rewards/rejected": -0.06022762134671211, + "sft_loss": 1.218145489692688, + "step": 345 + }, + { + "epoch": 0.28, + "grad_norm": 8.192936983091547, + "learning_rate": 4.666666666666667e-06, + "logits/chosen": -0.32254576683044434, + "logits/rejected": -0.6721733212471008, + "logps/chosen": -1.1165757179260254, + "logps/rejected": -1.2479455471038818, + "loss": 1.2225, + "odds_ratio_loss": 0.6464930176734924, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05582878738641739, + "rewards/margins": 0.006568485405296087, + "rewards/rejected": -0.062397271394729614, + "sft_loss": 1.1165757179260254, + "step": 350 + }, + { + "epoch": 0.284, + "grad_norm": 13.020714692622663, + "learning_rate": 4.7333333333333335e-06, + "logits/chosen": -0.4439846873283386, + "logits/rejected": -0.48574358224868774, + "logps/chosen": -1.4092596769332886, + "logps/rejected": -1.8674914836883545, + "loss": 1.1975, + "odds_ratio_loss": 0.6004700064659119, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07046298682689667, + "rewards/margins": 0.022911589592695236, + "rewards/rejected": -0.0933745726943016, + "sft_loss": 1.4092596769332886, + "step": 355 + }, + { + "epoch": 0.288, + "grad_norm": 11.400235583060827, + "learning_rate": 4.800000000000001e-06, + "logits/chosen": -0.3304193615913391, + "logits/rejected": -0.539570689201355, + "logps/chosen": -1.2983877658843994, + "logps/rejected": -1.2585299015045166, + "loss": 1.2775, + "odds_ratio_loss": 0.7338186502456665, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.06491939723491669, + "rewards/margins": -0.0019928955007344484, + "rewards/rejected": -0.06292649358510971, + "sft_loss": 1.2983877658843994, + "step": 360 + }, + { + "epoch": 0.292, + "grad_norm": 8.196409587525967, + "learning_rate": 4.866666666666667e-06, + "logits/chosen": -0.4542488157749176, + "logits/rejected": -0.49449652433395386, + "logps/chosen": -1.022592544555664, + "logps/rejected": -1.0563173294067383, + "loss": 1.3053, + "odds_ratio_loss": 0.7796744704246521, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.05112962797284126, + "rewards/margins": 0.0016862439224496484, + "rewards/rejected": -0.05281587317585945, + "sft_loss": 1.022592544555664, + "step": 365 + }, + { + "epoch": 0.296, + "grad_norm": 10.509812724188748, + "learning_rate": 4.933333333333334e-06, + "logits/chosen": -0.46988096833229065, + "logits/rejected": -0.31580036878585815, + "logps/chosen": -1.3959112167358398, + "logps/rejected": -1.6540298461914062, + "loss": 1.2666, + "odds_ratio_loss": 0.5971593856811523, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06979555636644363, + "rewards/margins": 0.012905935756862164, + "rewards/rejected": -0.08270148932933807, + "sft_loss": 1.3959112167358398, + "step": 370 + }, + { + "epoch": 0.3, + "grad_norm": 6.579280238314472, + "learning_rate": 5e-06, + "logits/chosen": -0.3907073736190796, + "logits/rejected": -0.5983395576477051, + "logps/chosen": -1.2141754627227783, + "logps/rejected": -1.3564534187316895, + "loss": 1.2967, + "odds_ratio_loss": 0.6791958808898926, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06070876866579056, + "rewards/margins": 0.007113890256732702, + "rewards/rejected": -0.06782267242670059, + "sft_loss": 1.2141754627227783, + "step": 375 + }, + { + "epoch": 0.304, + "grad_norm": 8.913119169147508, + "learning_rate": 4.999972922944898e-06, + "logits/chosen": -0.7129698395729065, + "logits/rejected": -0.48502880334854126, + "logps/chosen": -1.3781096935272217, + "logps/rejected": -1.5335767269134521, + "loss": 1.3143, + "odds_ratio_loss": 0.6898760795593262, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06890548765659332, + "rewards/margins": 0.007773351855576038, + "rewards/rejected": -0.07667883485555649, + "sft_loss": 1.3781096935272217, + "step": 380 + }, + { + "epoch": 0.308, + "grad_norm": 8.218029726787362, + "learning_rate": 4.999891692366121e-06, + "logits/chosen": -0.26474082469940186, + "logits/rejected": -0.49545183777809143, + "logps/chosen": -0.9910273551940918, + "logps/rejected": -0.944848895072937, + "loss": 1.2702, + "odds_ratio_loss": 0.7946783304214478, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.04955137148499489, + "rewards/margins": -0.0023089235182851553, + "rewards/rejected": -0.04724244400858879, + "sft_loss": 0.9910273551940918, + "step": 385 + }, + { + "epoch": 0.312, + "grad_norm": 7.274203044505213, + "learning_rate": 4.999756310023261e-06, + "logits/chosen": -0.2605069875717163, + "logits/rejected": -0.868638813495636, + "logps/chosen": -1.2463948726654053, + "logps/rejected": -1.2507517337799072, + "loss": 1.3006, + "odds_ratio_loss": 0.741990327835083, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06231974810361862, + "rewards/margins": 0.00021784081764053553, + "rewards/rejected": -0.06253758817911148, + "sft_loss": 1.2463948726654053, + "step": 390 + }, + { + "epoch": 0.316, + "grad_norm": 7.536549818073618, + "learning_rate": 4.99956677884892e-06, + "logits/chosen": -0.9248331785202026, + "logits/rejected": -0.4915364384651184, + "logps/chosen": -1.2074693441390991, + "logps/rejected": -1.51399827003479, + "loss": 1.2689, + "odds_ratio_loss": 0.590670645236969, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.060373466461896896, + "rewards/margins": 0.015326438471674919, + "rewards/rejected": -0.07569991052150726, + "sft_loss": 1.2074693441390991, + "step": 395 + }, + { + "epoch": 0.32, + "grad_norm": 7.302200763333058, + "learning_rate": 4.999323102948655e-06, + "logits/chosen": -1.0176507234573364, + "logits/rejected": -0.4075535833835602, + "logps/chosen": -1.2042930126190186, + "logps/rejected": -1.1218703985214233, + "loss": 1.3829, + "odds_ratio_loss": 0.8366869688034058, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06021466106176376, + "rewards/margins": -0.004121133126318455, + "rewards/rejected": -0.056093525141477585, + "sft_loss": 1.2042930126190186, + "step": 400 + }, + { + "epoch": 0.324, + "grad_norm": 9.822662210243099, + "learning_rate": 4.999025287600886e-06, + "logits/chosen": -0.6442962884902954, + "logits/rejected": -1.0240824222564697, + "logps/chosen": -1.1904699802398682, + "logps/rejected": -1.3809891939163208, + "loss": 1.192, + "odds_ratio_loss": 0.5962144732475281, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.059523504227399826, + "rewards/margins": 0.009525952860713005, + "rewards/rejected": -0.06904946267604828, + "sft_loss": 1.1904699802398682, + "step": 405 + }, + { + "epoch": 0.328, + "grad_norm": 7.990885197694433, + "learning_rate": 4.998673339256785e-06, + "logits/chosen": -0.47288981080055237, + "logits/rejected": -1.0867726802825928, + "logps/chosen": -1.395422101020813, + "logps/rejected": -1.6224483251571655, + "loss": 1.4465, + "odds_ratio_loss": 0.728722870349884, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06977111101150513, + "rewards/margins": 0.011351310648024082, + "rewards/rejected": -0.08112241327762604, + "sft_loss": 1.395422101020813, + "step": 410 + }, + { + "epoch": 0.332, + "grad_norm": 22.49371675260937, + "learning_rate": 4.99826726554013e-06, + "logits/chosen": -0.7810731530189514, + "logits/rejected": -0.6283494234085083, + "logps/chosen": -1.206227421760559, + "logps/rejected": -1.0297901630401611, + "loss": 1.3452, + "odds_ratio_loss": 0.9216625094413757, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.06031137704849243, + "rewards/margins": -0.008821866475045681, + "rewards/rejected": -0.051489509642124176, + "sft_loss": 1.206227421760559, + "step": 415 + }, + { + "epoch": 0.336, + "grad_norm": 7.038901075406193, + "learning_rate": 4.997807075247147e-06, + "logits/chosen": -0.4292398989200592, + "logits/rejected": -0.7970374226570129, + "logps/chosen": -1.2103397846221924, + "logps/rejected": -1.492582082748413, + "loss": 1.301, + "odds_ratio_loss": 0.5660519003868103, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06051699072122574, + "rewards/margins": 0.014112117700278759, + "rewards/rejected": -0.07462911307811737, + "sft_loss": 1.2103397846221924, + "step": 420 + }, + { + "epoch": 0.34, + "grad_norm": 13.51899355096485, + "learning_rate": 4.997292778346312e-06, + "logits/chosen": -0.3543488681316376, + "logits/rejected": -0.5476067662239075, + "logps/chosen": -1.12038254737854, + "logps/rejected": -1.4835479259490967, + "loss": 1.2928, + "odds_ratio_loss": 0.621757984161377, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.056019127368927, + "rewards/margins": 0.018158262595534325, + "rewards/rejected": -0.07417739927768707, + "sft_loss": 1.12038254737854, + "step": 425 + }, + { + "epoch": 0.344, + "grad_norm": 18.36630863667148, + "learning_rate": 4.996724385978142e-06, + "logits/chosen": -0.6492398977279663, + "logits/rejected": -0.38872796297073364, + "logps/chosen": -1.6991106271743774, + "logps/rejected": -1.4718869924545288, + "loss": 1.3601, + "odds_ratio_loss": 1.0099233388900757, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08495552837848663, + "rewards/margins": -0.011361182667315006, + "rewards/rejected": -0.0735943466424942, + "sft_loss": 1.6991106271743774, + "step": 430 + }, + { + "epoch": 0.348, + "grad_norm": 13.64259911410447, + "learning_rate": 4.996101910454953e-06, + "logits/chosen": -0.5003396272659302, + "logits/rejected": -0.9429534673690796, + "logps/chosen": -1.7060340642929077, + "logps/rejected": -1.4147275686264038, + "loss": 1.3717, + "odds_ratio_loss": 0.9737583994865417, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0853017121553421, + "rewards/margins": -0.014565333724021912, + "rewards/rejected": -0.07073638588190079, + "sft_loss": 1.7060340642929077, + "step": 435 + }, + { + "epoch": 0.352, + "grad_norm": 14.592917079690519, + "learning_rate": 4.995425365260585e-06, + "logits/chosen": -0.40251150727272034, + "logits/rejected": -0.9722174406051636, + "logps/chosen": -1.0912220478057861, + "logps/rejected": -1.363419771194458, + "loss": 1.2888, + "odds_ratio_loss": 0.5721346735954285, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05456110090017319, + "rewards/margins": 0.01360989362001419, + "rewards/rejected": -0.06817099452018738, + "sft_loss": 1.0912220478057861, + "step": 440 + }, + { + "epoch": 0.356, + "grad_norm": 6.937137874320563, + "learning_rate": 4.994694765050121e-06, + "logits/chosen": -0.8574808835983276, + "logits/rejected": -0.7796967625617981, + "logps/chosen": -1.3558590412139893, + "logps/rejected": -1.6645358800888062, + "loss": 1.2991, + "odds_ratio_loss": 0.5890018939971924, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06779295206069946, + "rewards/margins": 0.015433847904205322, + "rewards/rejected": -0.08322679251432419, + "sft_loss": 1.3558590412139893, + "step": 445 + }, + { + "epoch": 0.36, + "grad_norm": 13.634295128279398, + "learning_rate": 4.993910125649561e-06, + "logits/chosen": -0.35561490058898926, + "logits/rejected": -0.9158598184585571, + "logps/chosen": -1.2657678127288818, + "logps/rejected": -1.4462788105010986, + "loss": 1.3292, + "odds_ratio_loss": 0.7243040800094604, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06328839808702469, + "rewards/margins": 0.009025548584759235, + "rewards/rejected": -0.07231394946575165, + "sft_loss": 1.2657678127288818, + "step": 450 + }, + { + "epoch": 0.364, + "grad_norm": 15.588418308947864, + "learning_rate": 4.993071464055486e-06, + "logits/chosen": -0.7908333539962769, + "logits/rejected": -0.9238386154174805, + "logps/chosen": -1.464706301689148, + "logps/rejected": -1.4646415710449219, + "loss": 1.3062, + "odds_ratio_loss": 0.7673660516738892, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.07323531806468964, + "rewards/margins": -3.232434437450138e-06, + "rewards/rejected": -0.07323208451271057, + "sft_loss": 1.464706301689148, + "step": 455 + }, + { + "epoch": 0.368, + "grad_norm": 7.67023021612323, + "learning_rate": 4.992178798434684e-06, + "logits/chosen": -0.46548470854759216, + "logits/rejected": -1.2474113702774048, + "logps/chosen": -0.9748779535293579, + "logps/rejected": -1.329563856124878, + "loss": 1.2079, + "odds_ratio_loss": 0.5502606630325317, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.048743896186351776, + "rewards/margins": 0.01773429848253727, + "rewards/rejected": -0.0664781928062439, + "sft_loss": 0.9748779535293579, + "step": 460 + }, + { + "epoch": 0.372, + "grad_norm": 9.396021890858217, + "learning_rate": 4.9912321481237616e-06, + "logits/chosen": -0.3827522397041321, + "logits/rejected": -0.7908186912536621, + "logps/chosen": -1.2393275499343872, + "logps/rejected": -1.1069507598876953, + "loss": 1.1528, + "odds_ratio_loss": 0.8488165140151978, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06196638196706772, + "rewards/margins": -0.00661883782595396, + "rewards/rejected": -0.055347543209791183, + "sft_loss": 1.2393275499343872, + "step": 465 + }, + { + "epoch": 0.376, + "grad_norm": 15.667684557323762, + "learning_rate": 4.990231533628719e-06, + "logits/chosen": -0.7983174920082092, + "logits/rejected": -0.942952036857605, + "logps/chosen": -1.61797297000885, + "logps/rejected": -1.5470950603485107, + "loss": 1.3583, + "odds_ratio_loss": 0.9691923260688782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08089864999055862, + "rewards/margins": -0.003543901490047574, + "rewards/rejected": -0.07735475152730942, + "sft_loss": 1.61797297000885, + "step": 470 + }, + { + "epoch": 0.38, + "grad_norm": 10.262155313138615, + "learning_rate": 4.989176976624511e-06, + "logits/chosen": -0.5470207929611206, + "logits/rejected": -0.8008524179458618, + "logps/chosen": -1.2335219383239746, + "logps/rejected": -1.537672758102417, + "loss": 1.3457, + "odds_ratio_loss": 0.6203063130378723, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06167609617114067, + "rewards/margins": 0.015207541175186634, + "rewards/rejected": -0.07688363641500473, + "sft_loss": 1.2335219383239746, + "step": 475 + }, + { + "epoch": 0.384, + "grad_norm": 8.161147591651558, + "learning_rate": 4.988068499954578e-06, + "logits/chosen": -0.3918524384498596, + "logits/rejected": -0.5939075946807861, + "logps/chosen": -1.3265020847320557, + "logps/rejected": -1.2360165119171143, + "loss": 1.2193, + "odds_ratio_loss": 0.7953775525093079, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.0663251057267189, + "rewards/margins": -0.004524278454482555, + "rewards/rejected": -0.06180082634091377, + "sft_loss": 1.3265020847320557, + "step": 480 + }, + { + "epoch": 0.388, + "grad_norm": 7.500330838134909, + "learning_rate": 4.986906127630346e-06, + "logits/chosen": -0.4039885401725769, + "logits/rejected": -0.5603979825973511, + "logps/chosen": -1.1305885314941406, + "logps/rejected": -1.2077453136444092, + "loss": 1.31, + "odds_ratio_loss": 0.710327684879303, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05652942508459091, + "rewards/margins": 0.0038578410167247057, + "rewards/rejected": -0.0603872649371624, + "sft_loss": 1.1305885314941406, + "step": 485 + }, + { + "epoch": 0.392, + "grad_norm": 7.430782365855567, + "learning_rate": 4.985689884830711e-06, + "logits/chosen": -0.5145419836044312, + "logits/rejected": -0.5283325910568237, + "logps/chosen": -0.8873690366744995, + "logps/rejected": -1.0988765954971313, + "loss": 1.2163, + "odds_ratio_loss": 0.5778387784957886, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.044368453323841095, + "rewards/margins": 0.010575374588370323, + "rewards/rejected": -0.05494382977485657, + "sft_loss": 0.8873690366744995, + "step": 490 + }, + { + "epoch": 0.396, + "grad_norm": 7.833399890164202, + "learning_rate": 4.984419797901491e-06, + "logits/chosen": -0.6590821743011475, + "logits/rejected": -0.7595802545547485, + "logps/chosen": -1.2602180242538452, + "logps/rejected": -1.3176265954971313, + "loss": 1.2691, + "odds_ratio_loss": 0.8493593335151672, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06301090866327286, + "rewards/margins": 0.0028704279102385044, + "rewards/rejected": -0.06588132679462433, + "sft_loss": 1.2602180242538452, + "step": 495 + }, + { + "epoch": 0.4, + "grad_norm": 18.632214535791704, + "learning_rate": 4.983095894354858e-06, + "logits/chosen": -0.4741068482398987, + "logits/rejected": -0.6550690531730652, + "logps/chosen": -1.0877597332000732, + "logps/rejected": -1.4105587005615234, + "loss": 1.2655, + "odds_ratio_loss": 0.6428815126419067, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05438799411058426, + "rewards/margins": 0.01613995060324669, + "rewards/rejected": -0.07052794098854065, + "sft_loss": 1.0877597332000732, + "step": 500 + }, + { + "epoch": 0.404, + "grad_norm": 13.093628531972609, + "learning_rate": 4.981718202868738e-06, + "logits/chosen": -0.37520939111709595, + "logits/rejected": -0.5333417654037476, + "logps/chosen": -1.2307997941970825, + "logps/rejected": -1.3670756816864014, + "loss": 1.3233, + "odds_ratio_loss": 0.7391397356987, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06153998523950577, + "rewards/margins": 0.006813795771449804, + "rewards/rejected": -0.06835378706455231, + "sft_loss": 1.2307997941970825, + "step": 505 + }, + { + "epoch": 0.408, + "grad_norm": 32.02171447844317, + "learning_rate": 4.980286753286196e-06, + "logits/chosen": -1.0221941471099854, + "logits/rejected": -0.6870493292808533, + "logps/chosen": -1.1637510061264038, + "logps/rejected": -1.1249791383743286, + "loss": 1.2777, + "odds_ratio_loss": 0.7576397061347961, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.05818755179643631, + "rewards/margins": -0.0019385985797271132, + "rewards/rejected": -0.05624895542860031, + "sft_loss": 1.1637510061264038, + "step": 510 + }, + { + "epoch": 0.412, + "grad_norm": 8.922962337548432, + "learning_rate": 4.978801576614779e-06, + "logits/chosen": -0.5233002305030823, + "logits/rejected": -0.7131946086883545, + "logps/chosen": -1.1369407176971436, + "logps/rejected": -1.2576231956481934, + "loss": 1.3731, + "odds_ratio_loss": 0.630257248878479, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05684703588485718, + "rewards/margins": 0.00603412976488471, + "rewards/rejected": -0.06288117170333862, + "sft_loss": 1.1369407176971436, + "step": 515 + }, + { + "epoch": 0.416, + "grad_norm": 6.071002612452945, + "learning_rate": 4.97726270502586e-06, + "logits/chosen": -0.751970112323761, + "logits/rejected": -0.5403101444244385, + "logps/chosen": -1.125468373298645, + "logps/rejected": -1.4324285984039307, + "loss": 1.2763, + "odds_ratio_loss": 0.6366511583328247, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05627341940999031, + "rewards/margins": 0.015348007902503014, + "rewards/rejected": -0.07162143290042877, + "sft_loss": 1.125468373298645, + "step": 520 + }, + { + "epoch": 0.42, + "grad_norm": 19.360606964673867, + "learning_rate": 4.975670171853926e-06, + "logits/chosen": -0.6735143065452576, + "logits/rejected": -0.5386877059936523, + "logps/chosen": -1.3962455987930298, + "logps/rejected": -1.3698430061340332, + "loss": 1.3162, + "odds_ratio_loss": 0.7852639555931091, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06981228291988373, + "rewards/margins": -0.0013201329857110977, + "rewards/rejected": -0.06849215179681778, + "sft_loss": 1.3962455987930298, + "step": 525 + }, + { + "epoch": 0.424, + "grad_norm": 12.60459466219797, + "learning_rate": 4.974024011595864e-06, + "logits/chosen": -0.726483941078186, + "logits/rejected": -0.45278626680374146, + "logps/chosen": -1.1809227466583252, + "logps/rejected": -1.4415466785430908, + "loss": 1.2782, + "odds_ratio_loss": 0.5937660932540894, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05904613807797432, + "rewards/margins": 0.013031196780502796, + "rewards/rejected": -0.07207732647657394, + "sft_loss": 1.1809227466583252, + "step": 530 + }, + { + "epoch": 0.428, + "grad_norm": 16.735047153184535, + "learning_rate": 4.97232425991021e-06, + "logits/chosen": -0.7554991245269775, + "logits/rejected": -0.6952065825462341, + "logps/chosen": -1.2135225534439087, + "logps/rejected": -1.2788511514663696, + "loss": 1.2871, + "odds_ratio_loss": 0.6615133881568909, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.060676127672195435, + "rewards/margins": 0.0032664339523762465, + "rewards/rejected": -0.0639425590634346, + "sft_loss": 1.2135225534439087, + "step": 535 + }, + { + "epoch": 0.432, + "grad_norm": 28.636993191829472, + "learning_rate": 4.970570953616383e-06, + "logits/chosen": -0.3668878972530365, + "logits/rejected": -0.47576600313186646, + "logps/chosen": -1.254703402519226, + "logps/rejected": -1.4109866619110107, + "loss": 1.3499, + "odds_ratio_loss": 0.6886258125305176, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0627351701259613, + "rewards/margins": 0.007814161479473114, + "rewards/rejected": -0.07054933160543442, + "sft_loss": 1.254703402519226, + "step": 540 + }, + { + "epoch": 0.436, + "grad_norm": 8.20123877269534, + "learning_rate": 4.9687641306938766e-06, + "logits/chosen": -0.596192479133606, + "logits/rejected": -0.34622710943222046, + "logps/chosen": -1.258967399597168, + "logps/rejected": -1.168402910232544, + "loss": 1.2366, + "odds_ratio_loss": 0.9441742897033691, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06294836848974228, + "rewards/margins": -0.004528213292360306, + "rewards/rejected": -0.058420151472091675, + "sft_loss": 1.258967399597168, + "step": 545 + }, + { + "epoch": 0.44, + "grad_norm": 19.0599247063106, + "learning_rate": 4.966903830281449e-06, + "logits/chosen": -0.550312876701355, + "logits/rejected": -0.7260756492614746, + "logps/chosen": -0.8798558115959167, + "logps/rejected": -0.9841960072517395, + "loss": 1.2888, + "odds_ratio_loss": 0.6650792360305786, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.043992795050144196, + "rewards/margins": 0.00521700968965888, + "rewards/rejected": -0.049209803342819214, + "sft_loss": 0.8798558115959167, + "step": 550 + }, + { + "epoch": 0.444, + "grad_norm": 9.539583052659502, + "learning_rate": 4.964990092676263e-06, + "logits/chosen": -0.5485479235649109, + "logits/rejected": -0.2632920444011688, + "logps/chosen": -1.2334239482879639, + "logps/rejected": -1.2576792240142822, + "loss": 1.1946, + "odds_ratio_loss": 0.7366870641708374, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06167120486497879, + "rewards/margins": 0.0012127698864787817, + "rewards/rejected": -0.06288396567106247, + "sft_loss": 1.2334239482879639, + "step": 555 + }, + { + "epoch": 0.448, + "grad_norm": 6.575286386431191, + "learning_rate": 4.9630229593330226e-06, + "logits/chosen": -0.5613077878952026, + "logits/rejected": -0.9460631608963013, + "logps/chosen": -1.4012134075164795, + "logps/rejected": -1.474373698234558, + "loss": 1.356, + "odds_ratio_loss": 0.7623913884162903, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.07006067782640457, + "rewards/margins": 0.0036580085288733244, + "rewards/rejected": -0.07371868193149567, + "sft_loss": 1.4012134075164795, + "step": 560 + }, + { + "epoch": 0.452, + "grad_norm": 8.260756058648278, + "learning_rate": 4.96100247286307e-06, + "logits/chosen": -0.3762677311897278, + "logits/rejected": -0.681728184223175, + "logps/chosen": -1.2107107639312744, + "logps/rejected": -1.336549162864685, + "loss": 1.2595, + "odds_ratio_loss": 0.6657492518424988, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06053553894162178, + "rewards/margins": 0.006291926838457584, + "rewards/rejected": -0.06682746857404709, + "sft_loss": 1.2107107639312744, + "step": 565 + }, + { + "epoch": 0.456, + "grad_norm": 5.224482900966935, + "learning_rate": 4.958928677033465e-06, + "logits/chosen": -0.5366466045379639, + "logits/rejected": -0.2813822627067566, + "logps/chosen": -1.0963077545166016, + "logps/rejected": -1.9244539737701416, + "loss": 1.1666, + "odds_ratio_loss": 0.4927639961242676, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0548153892159462, + "rewards/margins": 0.04140730947256088, + "rewards/rejected": -0.09622270613908768, + "sft_loss": 1.0963077545166016, + "step": 570 + }, + { + "epoch": 0.46, + "grad_norm": 9.231089411843929, + "learning_rate": 4.956801616766033e-06, + "logits/chosen": -0.41479817032814026, + "logits/rejected": -1.2379553318023682, + "logps/chosen": -1.3129132986068726, + "logps/rejected": -1.4864296913146973, + "loss": 1.2148, + "odds_ratio_loss": 0.6478667259216309, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06564567983150482, + "rewards/margins": 0.008675819262862206, + "rewards/rejected": -0.07432149350643158, + "sft_loss": 1.3129132986068726, + "step": 575 + }, + { + "epoch": 0.464, + "grad_norm": 12.571799699194248, + "learning_rate": 4.954621338136399e-06, + "logits/chosen": -0.43359094858169556, + "logits/rejected": -0.7193197011947632, + "logps/chosen": -1.1483484506607056, + "logps/rejected": -1.4485629796981812, + "loss": 1.217, + "odds_ratio_loss": 0.5974001288414001, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05741741508245468, + "rewards/margins": 0.015010732226073742, + "rewards/rejected": -0.0724281519651413, + "sft_loss": 1.1483484506607056, + "step": 580 + }, + { + "epoch": 0.468, + "grad_norm": 13.14113848877051, + "learning_rate": 4.9523878883729794e-06, + "logits/chosen": -0.3140491545200348, + "logits/rejected": -1.1171773672103882, + "logps/chosen": -1.1811208724975586, + "logps/rejected": -1.2877556085586548, + "loss": 1.3736, + "odds_ratio_loss": 0.6588929295539856, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05905604362487793, + "rewards/margins": 0.005331738851964474, + "rewards/rejected": -0.06438778340816498, + "sft_loss": 1.1811208724975586, + "step": 585 + }, + { + "epoch": 0.472, + "grad_norm": 6.741763650848641, + "learning_rate": 4.95010131585597e-06, + "logits/chosen": -0.41005969047546387, + "logits/rejected": -1.1937333345413208, + "logps/chosen": -1.1939094066619873, + "logps/rejected": -1.519858717918396, + "loss": 1.3015, + "odds_ratio_loss": 0.5203220844268799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05969547480344772, + "rewards/margins": 0.016297463327646255, + "rewards/rejected": -0.07599293440580368, + "sft_loss": 1.1939094066619873, + "step": 590 + }, + { + "epoch": 0.476, + "grad_norm": 11.415912203136902, + "learning_rate": 4.94776167011629e-06, + "logits/chosen": -0.8332249522209167, + "logits/rejected": -1.193943738937378, + "logps/chosen": -1.5026148557662964, + "logps/rejected": -1.3563969135284424, + "loss": 1.318, + "odds_ratio_loss": 0.8462100028991699, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.07513075321912766, + "rewards/margins": -0.007310903165489435, + "rewards/rejected": -0.06781984865665436, + "sft_loss": 1.5026148557662964, + "step": 595 + }, + { + "epoch": 0.48, + "grad_norm": 6.517733955264292, + "learning_rate": 4.9453690018345144e-06, + "logits/chosen": -0.7536684274673462, + "logits/rejected": -0.5858591794967651, + "logps/chosen": -1.0818665027618408, + "logps/rejected": -0.9140428304672241, + "loss": 1.2555, + "odds_ratio_loss": 0.847333550453186, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.05409332364797592, + "rewards/margins": -0.008391184732317924, + "rewards/rejected": -0.045702140778303146, + "sft_loss": 1.0818665027618408, + "step": 600 + }, + { + "epoch": 0.484, + "grad_norm": 11.605525874938747, + "learning_rate": 4.94292336283977e-06, + "logits/chosen": -0.4545044004917145, + "logits/rejected": -0.9466699361801147, + "logps/chosen": -1.3461689949035645, + "logps/rejected": -1.257264256477356, + "loss": 1.2515, + "odds_ratio_loss": 0.7972198724746704, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.0673084557056427, + "rewards/margins": -0.0044452352449297905, + "rewards/rejected": -0.06286321580410004, + "sft_loss": 1.3461689949035645, + "step": 605 + }, + { + "epoch": 0.488, + "grad_norm": 16.623733933538222, + "learning_rate": 4.940424806108619e-06, + "logits/chosen": -0.44307154417037964, + "logits/rejected": -1.1300188302993774, + "logps/chosen": -1.5469882488250732, + "logps/rejected": -1.546287178993225, + "loss": 1.296, + "odds_ratio_loss": 0.7446098923683167, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07734941691160202, + "rewards/margins": -3.506392386043444e-05, + "rewards/rejected": -0.0773143544793129, + "sft_loss": 1.5469882488250732, + "step": 610 + }, + { + "epoch": 0.492, + "grad_norm": 14.50351733733393, + "learning_rate": 4.937873385763909e-06, + "logits/chosen": -0.4979380667209625, + "logits/rejected": -0.7332254648208618, + "logps/chosen": -1.186477780342102, + "logps/rejected": -1.4961540699005127, + "loss": 1.2214, + "odds_ratio_loss": 0.5873562097549438, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05932389572262764, + "rewards/margins": 0.015483811497688293, + "rewards/rejected": -0.07480770349502563, + "sft_loss": 1.186477780342102, + "step": 615 + }, + { + "epoch": 0.496, + "grad_norm": 10.189101808498776, + "learning_rate": 4.935269157073597e-06, + "logits/chosen": -0.30127787590026855, + "logits/rejected": -0.2239534556865692, + "logps/chosen": -1.0941071510314941, + "logps/rejected": -1.155369520187378, + "loss": 1.2658, + "odds_ratio_loss": 0.7786355018615723, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.054705359041690826, + "rewards/margins": 0.0030631190165877342, + "rewards/rejected": -0.057768482714891434, + "sft_loss": 1.0941071510314941, + "step": 620 + }, + { + "epoch": 0.5, + "grad_norm": 6.717391051471845, + "learning_rate": 4.93261217644956e-06, + "logits/chosen": -0.6986908316612244, + "logits/rejected": -0.6756628751754761, + "logps/chosen": -1.0286588668823242, + "logps/rejected": -1.368683099746704, + "loss": 1.2814, + "odds_ratio_loss": 0.5862299203872681, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05143294483423233, + "rewards/margins": 0.017001213505864143, + "rewards/rejected": -0.06843416392803192, + "sft_loss": 1.0286588668823242, + "step": 625 + }, + { + "epoch": 0.504, + "grad_norm": 4.84256770780635, + "learning_rate": 4.9299025014463665e-06, + "logits/chosen": -0.6707471609115601, + "logits/rejected": -0.4917599558830261, + "logps/chosen": -1.401749849319458, + "logps/rejected": -1.291032075881958, + "loss": 1.2497, + "odds_ratio_loss": 0.8192909359931946, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0700874850153923, + "rewards/margins": -0.005535887088626623, + "rewards/rejected": -0.06455160677433014, + "sft_loss": 1.401749849319458, + "step": 630 + }, + { + "epoch": 0.508, + "grad_norm": 5.259599717240724, + "learning_rate": 4.92714019076003e-06, + "logits/chosen": -0.6145176291465759, + "logits/rejected": -0.6850901246070862, + "logps/chosen": -1.1875053644180298, + "logps/rejected": -1.5674359798431396, + "loss": 1.2434, + "odds_ratio_loss": 0.4647662043571472, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.059375274926424026, + "rewards/margins": 0.018996523693203926, + "rewards/rejected": -0.0783717930316925, + "sft_loss": 1.1875053644180298, + "step": 635 + }, + { + "epoch": 0.512, + "grad_norm": 8.97887669520296, + "learning_rate": 4.924325304226745e-06, + "logits/chosen": -0.509232223033905, + "logits/rejected": -0.804535984992981, + "logps/chosen": -0.9804172515869141, + "logps/rejected": -1.4311110973358154, + "loss": 1.2258, + "odds_ratio_loss": 0.47105565667152405, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04902086406946182, + "rewards/margins": 0.022534683346748352, + "rewards/rejected": -0.07155554741621017, + "sft_loss": 0.9804172515869141, + "step": 640 + }, + { + "epoch": 0.516, + "grad_norm": 12.339274811909346, + "learning_rate": 4.921457902821578e-06, + "logits/chosen": -0.6703714728355408, + "logits/rejected": -0.7211524844169617, + "logps/chosen": -1.412858247756958, + "logps/rejected": -1.7014133930206299, + "loss": 1.2609, + "odds_ratio_loss": 0.6318767666816711, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07064291089773178, + "rewards/margins": 0.014427749440073967, + "rewards/rejected": -0.0850706622004509, + "sft_loss": 1.412858247756958, + "step": 645 + }, + { + "epoch": 0.52, + "grad_norm": 6.902631133651239, + "learning_rate": 4.91853804865716e-06, + "logits/chosen": -0.423556387424469, + "logits/rejected": -0.6117393970489502, + "logps/chosen": -0.9919289350509644, + "logps/rejected": -1.5979692935943604, + "loss": 1.2347, + "odds_ratio_loss": 0.4265903830528259, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.049596451222896576, + "rewards/margins": 0.0303020179271698, + "rewards/rejected": -0.07989846915006638, + "sft_loss": 0.9919289350509644, + "step": 650 + }, + { + "epoch": 0.524, + "grad_norm": 20.833564963024443, + "learning_rate": 4.915565804982332e-06, + "logits/chosen": -0.6421648263931274, + "logits/rejected": -0.2954399287700653, + "logps/chosen": -1.2931578159332275, + "logps/rejected": -1.3003164529800415, + "loss": 1.2488, + "odds_ratio_loss": 0.7667359113693237, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06465788185596466, + "rewards/margins": 0.0003579244075808674, + "rewards/rejected": -0.06501581519842148, + "sft_loss": 1.2931578159332275, + "step": 655 + }, + { + "epoch": 0.528, + "grad_norm": 5.965871281558726, + "learning_rate": 4.912541236180779e-06, + "logits/chosen": -0.6520070433616638, + "logits/rejected": -0.7092788815498352, + "logps/chosen": -1.151734471321106, + "logps/rejected": -1.186388611793518, + "loss": 1.1441, + "odds_ratio_loss": 0.7639530301094055, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05758672207593918, + "rewards/margins": 0.0017327029490843415, + "rewards/rejected": -0.059319429099559784, + "sft_loss": 1.151734471321106, + "step": 660 + }, + { + "epoch": 0.532, + "grad_norm": 6.664910493167132, + "learning_rate": 4.909464407769633e-06, + "logits/chosen": -0.5549234747886658, + "logits/rejected": -0.8910662531852722, + "logps/chosen": -1.2270019054412842, + "logps/rejected": -1.200127363204956, + "loss": 1.23, + "odds_ratio_loss": 0.8013579249382019, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06135009601712227, + "rewards/margins": -0.0013437264133244753, + "rewards/rejected": -0.060006361454725266, + "sft_loss": 1.2270019054412842, + "step": 665 + }, + { + "epoch": 0.536, + "grad_norm": 5.1055719510419335, + "learning_rate": 4.9063353863980565e-06, + "logits/chosen": -0.4341130256652832, + "logits/rejected": -0.791808545589447, + "logps/chosen": -1.1596591472625732, + "logps/rejected": -1.5821669101715088, + "loss": 1.2846, + "odds_ratio_loss": 0.5709139108657837, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05798295885324478, + "rewards/margins": 0.021125389263033867, + "rewards/rejected": -0.0791083499789238, + "sft_loss": 1.1596591472625732, + "step": 670 + }, + { + "epoch": 0.54, + "grad_norm": 7.5751885145020506, + "learning_rate": 4.903154239845798e-06, + "logits/chosen": -0.4769849181175232, + "logits/rejected": -0.5648764967918396, + "logps/chosen": -1.1110217571258545, + "logps/rejected": -1.4287192821502686, + "loss": 1.2388, + "odds_ratio_loss": 0.5047262907028198, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05555109307169914, + "rewards/margins": 0.015884879976511, + "rewards/rejected": -0.07143596559762955, + "sft_loss": 1.1110217571258545, + "step": 675 + }, + { + "epoch": 0.544, + "grad_norm": 7.921191214189487, + "learning_rate": 4.899921037021719e-06, + "logits/chosen": -0.36087316274642944, + "logits/rejected": -0.1616624891757965, + "logps/chosen": -1.1064417362213135, + "logps/rejected": -0.9575250744819641, + "loss": 1.2096, + "odds_ratio_loss": 0.8907485008239746, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.05532208830118179, + "rewards/margins": -0.007445829920470715, + "rewards/rejected": -0.047876257449388504, + "sft_loss": 1.1064417362213135, + "step": 680 + }, + { + "epoch": 0.548, + "grad_norm": 12.574180795892904, + "learning_rate": 4.896635847962311e-06, + "logits/chosen": -0.6308740973472595, + "logits/rejected": -0.6598986387252808, + "logps/chosen": -1.0816127061843872, + "logps/rejected": -1.5012505054473877, + "loss": 1.2037, + "odds_ratio_loss": 0.6682350635528564, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05408062785863876, + "rewards/margins": 0.02098189666867256, + "rewards/rejected": -0.07506252825260162, + "sft_loss": 1.0816127061843872, + "step": 685 + }, + { + "epoch": 0.552, + "grad_norm": 11.992784752793785, + "learning_rate": 4.893298743830168e-06, + "logits/chosen": -0.44944000244140625, + "logits/rejected": -0.8219810724258423, + "logps/chosen": -1.5772783756256104, + "logps/rejected": -1.2028121948242188, + "loss": 1.2864, + "odds_ratio_loss": 1.1701303720474243, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07886390388011932, + "rewards/margins": -0.018723303452134132, + "rewards/rejected": -0.06014060229063034, + "sft_loss": 1.5772783756256104, + "step": 690 + }, + { + "epoch": 0.556, + "grad_norm": 8.358855854874342, + "learning_rate": 4.889909796912454e-06, + "logits/chosen": -0.28839197754859924, + "logits/rejected": -0.44521206617355347, + "logps/chosen": -1.1581156253814697, + "logps/rejected": -1.1767852306365967, + "loss": 1.2989, + "odds_ratio_loss": 0.6894288063049316, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.057905782014131546, + "rewards/margins": 0.0009334773058071733, + "rewards/rejected": -0.058839261531829834, + "sft_loss": 1.1581156253814697, + "step": 695 + }, + { + "epoch": 0.56, + "grad_norm": 6.831254137005319, + "learning_rate": 4.88646908061933e-06, + "logits/chosen": -0.6353722214698792, + "logits/rejected": -0.7715078592300415, + "logps/chosen": -1.5473369359970093, + "logps/rejected": -1.5499107837677002, + "loss": 1.2584, + "odds_ratio_loss": 0.7936614751815796, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.07736685127019882, + "rewards/margins": 0.00012868604972027242, + "rewards/rejected": -0.0774955302476883, + "sft_loss": 1.5473369359970093, + "step": 700 + }, + { + "epoch": 0.564, + "grad_norm": 10.218090713494131, + "learning_rate": 4.882976669482368e-06, + "logits/chosen": -0.6931608319282532, + "logits/rejected": -0.6439999341964722, + "logps/chosen": -1.328382968902588, + "logps/rejected": -1.6422935724258423, + "loss": 1.3709, + "odds_ratio_loss": 0.5778505802154541, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06641916185617447, + "rewards/margins": 0.015695523470640182, + "rewards/rejected": -0.08211468160152435, + "sft_loss": 1.328382968902588, + "step": 705 + }, + { + "epoch": 0.568, + "grad_norm": 5.768384836009137, + "learning_rate": 4.879432639152935e-06, + "logits/chosen": -0.5731868147850037, + "logits/rejected": -0.9456470608711243, + "logps/chosen": -0.9817520976066589, + "logps/rejected": -1.3865406513214111, + "loss": 1.2994, + "odds_ratio_loss": 0.48160356283187866, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.049087610095739365, + "rewards/margins": 0.02023942954838276, + "rewards/rejected": -0.06932704150676727, + "sft_loss": 0.9817520976066589, + "step": 710 + }, + { + "epoch": 0.572, + "grad_norm": 7.509810654555548, + "learning_rate": 4.875837066400553e-06, + "logits/chosen": -0.691825807094574, + "logits/rejected": -0.5271461606025696, + "logps/chosen": -0.9871689677238464, + "logps/rejected": -1.324852705001831, + "loss": 1.2221, + "odds_ratio_loss": 0.5907098054885864, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04935844987630844, + "rewards/margins": 0.016884196549654007, + "rewards/rejected": -0.06624264270067215, + "sft_loss": 0.9871689677238464, + "step": 715 + }, + { + "epoch": 0.576, + "grad_norm": 24.694839731628768, + "learning_rate": 4.8721900291112415e-06, + "logits/chosen": -0.8862325549125671, + "logits/rejected": -0.5018847584724426, + "logps/chosen": -1.0031644105911255, + "logps/rejected": -1.247604250907898, + "loss": 1.2261, + "odds_ratio_loss": 0.6080331802368164, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05015822499990463, + "rewards/margins": 0.012221988290548325, + "rewards/rejected": -0.06238021329045296, + "sft_loss": 1.0031644105911255, + "step": 720 + }, + { + "epoch": 0.58, + "grad_norm": 26.923254409620736, + "learning_rate": 4.868491606285823e-06, + "logits/chosen": -0.5057476162910461, + "logits/rejected": -0.7251291275024414, + "logps/chosen": -1.470099687576294, + "logps/rejected": -1.3734233379364014, + "loss": 1.3296, + "odds_ratio_loss": 0.9267051815986633, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07350499927997589, + "rewards/margins": -0.004833821672946215, + "rewards/rejected": -0.06867116689682007, + "sft_loss": 1.470099687576294, + "step": 725 + }, + { + "epoch": 0.584, + "grad_norm": 7.574149129374884, + "learning_rate": 4.864741878038218e-06, + "logits/chosen": -0.46996521949768066, + "logits/rejected": -0.6176045536994934, + "logps/chosen": -1.454437017440796, + "logps/rejected": -1.3104689121246338, + "loss": 1.2605, + "odds_ratio_loss": 0.8346797823905945, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.07272185385227203, + "rewards/margins": -0.007198403589427471, + "rewards/rejected": -0.06552345305681229, + "sft_loss": 1.454437017440796, + "step": 730 + }, + { + "epoch": 0.588, + "grad_norm": 8.305352840446517, + "learning_rate": 4.860940925593703e-06, + "logits/chosen": -0.30859681963920593, + "logits/rejected": -1.0121045112609863, + "logps/chosen": -1.1916425228118896, + "logps/rejected": -1.6442506313323975, + "loss": 1.2909, + "odds_ratio_loss": 0.5468284487724304, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05958213284611702, + "rewards/margins": 0.02263040840625763, + "rewards/rejected": -0.08221253752708435, + "sft_loss": 1.1916425228118896, + "step": 735 + }, + { + "epoch": 0.592, + "grad_norm": 30.679914716790897, + "learning_rate": 4.857088831287158e-06, + "logits/chosen": -0.3152746558189392, + "logits/rejected": -0.498546838760376, + "logps/chosen": -1.2811144590377808, + "logps/rejected": -1.4738355875015259, + "loss": 1.242, + "odds_ratio_loss": 0.6398525834083557, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06405572593212128, + "rewards/margins": 0.009636052884161472, + "rewards/rejected": -0.07369177788496017, + "sft_loss": 1.2811144590377808, + "step": 740 + }, + { + "epoch": 0.596, + "grad_norm": 7.06148057428711, + "learning_rate": 4.85318567856128e-06, + "logits/chosen": -0.6131590604782104, + "logits/rejected": -0.22784848511219025, + "logps/chosen": -1.190834879875183, + "logps/rejected": -1.4315545558929443, + "loss": 1.211, + "odds_ratio_loss": 0.6610680818557739, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.059541743248701096, + "rewards/margins": 0.012035989202558994, + "rewards/rejected": -0.07157773524522781, + "sft_loss": 1.190834879875183, + "step": 745 + }, + { + "epoch": 0.6, + "grad_norm": 10.554654256963408, + "learning_rate": 4.849231551964771e-06, + "logits/chosen": -0.30865949392318726, + "logits/rejected": -0.6756810545921326, + "logps/chosen": -0.9886703491210938, + "logps/rejected": -1.524072289466858, + "loss": 1.149, + "odds_ratio_loss": 0.4154701828956604, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.049433521926403046, + "rewards/margins": 0.0267700906842947, + "rewards/rejected": -0.0762036144733429, + "sft_loss": 0.9886703491210938, + "step": 750 + }, + { + "epoch": 0.604, + "grad_norm": 8.5013064131026, + "learning_rate": 4.8452265371505176e-06, + "logits/chosen": -0.3945849537849426, + "logits/rejected": -0.6222106218338013, + "logps/chosen": -1.2622836828231812, + "logps/rejected": -1.3229162693023682, + "loss": 1.2813, + "odds_ratio_loss": 0.7592854499816895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06311418116092682, + "rewards/margins": 0.0030316333286464214, + "rewards/rejected": -0.06614581495523453, + "sft_loss": 1.2622836828231812, + "step": 755 + }, + { + "epoch": 0.608, + "grad_norm": 6.339074653596066, + "learning_rate": 4.841170720873723e-06, + "logits/chosen": -0.6454433798789978, + "logits/rejected": -0.5933985710144043, + "logps/chosen": -1.0739322900772095, + "logps/rejected": -1.2623248100280762, + "loss": 1.2882, + "odds_ratio_loss": 0.6267717480659485, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.053696613758802414, + "rewards/margins": 0.009419633075594902, + "rewards/rejected": -0.06311623752117157, + "sft_loss": 1.0739322900772095, + "step": 760 + }, + { + "epoch": 0.612, + "grad_norm": 9.528245360044645, + "learning_rate": 4.837064190990036e-06, + "logits/chosen": -0.5372971296310425, + "logits/rejected": -0.4304935932159424, + "logps/chosen": -1.3918521404266357, + "logps/rejected": -1.2833397388458252, + "loss": 1.2469, + "odds_ratio_loss": 0.8755296468734741, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06959259510040283, + "rewards/margins": -0.005425615236163139, + "rewards/rejected": -0.06416698545217514, + "sft_loss": 1.3918521404266357, + "step": 765 + }, + { + "epoch": 0.616, + "grad_norm": 45.4719267607946, + "learning_rate": 4.832907036453647e-06, + "logits/chosen": -0.6772319078445435, + "logits/rejected": -0.29380422830581665, + "logps/chosen": -1.3806759119033813, + "logps/rejected": -1.4640812873840332, + "loss": 1.2546, + "odds_ratio_loss": 0.7038690447807312, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06903379410505295, + "rewards/margins": 0.00417026923969388, + "rewards/rejected": -0.07320406287908554, + "sft_loss": 1.3806759119033813, + "step": 770 + }, + { + "epoch": 0.62, + "grad_norm": 6.67487476307786, + "learning_rate": 4.828699347315357e-06, + "logits/chosen": -0.43924084305763245, + "logits/rejected": -0.685263991355896, + "logps/chosen": -1.3941123485565186, + "logps/rejected": -1.5404117107391357, + "loss": 1.3004, + "odds_ratio_loss": 0.6843758821487427, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06970562040805817, + "rewards/margins": 0.0073149665258824825, + "rewards/rejected": -0.07702059298753738, + "sft_loss": 1.3941123485565186, + "step": 775 + }, + { + "epoch": 0.624, + "grad_norm": 8.063952217109858, + "learning_rate": 4.824441214720629e-06, + "logits/chosen": -0.4597851634025574, + "logits/rejected": -0.7097247242927551, + "logps/chosen": -1.4838628768920898, + "logps/rejected": -1.552299976348877, + "loss": 1.3045, + "odds_ratio_loss": 0.7198641300201416, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07419314980506897, + "rewards/margins": 0.0034218481741845608, + "rewards/rejected": -0.07761499285697937, + "sft_loss": 1.4838628768920898, + "step": 780 + }, + { + "epoch": 0.628, + "grad_norm": 8.186475314952034, + "learning_rate": 4.8201327309076176e-06, + "logits/chosen": -0.6055313944816589, + "logits/rejected": -0.5660229325294495, + "logps/chosen": -1.3449480533599854, + "logps/rejected": -1.3947374820709229, + "loss": 1.2475, + "odds_ratio_loss": 0.701248824596405, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0672474056482315, + "rewards/margins": 0.0024894648231565952, + "rewards/rejected": -0.06973687559366226, + "sft_loss": 1.3449480533599854, + "step": 785 + }, + { + "epoch": 0.632, + "grad_norm": 11.28386887635978, + "learning_rate": 4.815773989205165e-06, + "logits/chosen": -0.4913761615753174, + "logits/rejected": -0.7349493503570557, + "logps/chosen": -1.3903801441192627, + "logps/rejected": -1.261516809463501, + "loss": 1.2877, + "odds_ratio_loss": 0.858357310295105, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06951900571584702, + "rewards/margins": -0.006443170364946127, + "rewards/rejected": -0.06307584047317505, + "sft_loss": 1.3903801441192627, + "step": 790 + }, + { + "epoch": 0.636, + "grad_norm": 15.974062267930158, + "learning_rate": 4.811365084030784e-06, + "logits/chosen": -0.6103811264038086, + "logits/rejected": -0.6908004879951477, + "logps/chosen": -1.1836979389190674, + "logps/rejected": -1.4412634372711182, + "loss": 1.2186, + "odds_ratio_loss": 0.599416971206665, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05918489769101143, + "rewards/margins": 0.012878276407718658, + "rewards/rejected": -0.07206316292285919, + "sft_loss": 1.1836979389190674, + "step": 795 + }, + { + "epoch": 0.64, + "grad_norm": 9.332345967135447, + "learning_rate": 4.806906110888606e-06, + "logits/chosen": -0.10099928081035614, + "logits/rejected": -0.21330437064170837, + "logps/chosen": -0.9208562970161438, + "logps/rejected": -1.2308391332626343, + "loss": 1.2217, + "odds_ratio_loss": 0.5371675491333008, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04604281857609749, + "rewards/margins": 0.01549914013594389, + "rewards/rejected": -0.061541955918073654, + "sft_loss": 0.9208562970161438, + "step": 800 + }, + { + "epoch": 0.644, + "grad_norm": 6.712188908944328, + "learning_rate": 4.8023971663673235e-06, + "logits/chosen": -0.45073431730270386, + "logits/rejected": -0.6796762347221375, + "logps/chosen": -1.2248754501342773, + "logps/rejected": -1.1859427690505981, + "loss": 1.2317, + "odds_ratio_loss": 0.9216899871826172, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06124377250671387, + "rewards/margins": -0.0019466314697638154, + "rewards/rejected": -0.059297144412994385, + "sft_loss": 1.2248754501342773, + "step": 805 + }, + { + "epoch": 0.648, + "grad_norm": 6.089598640062157, + "learning_rate": 4.7978383481380865e-06, + "logits/chosen": -0.7606781125068665, + "logits/rejected": -0.34865307807922363, + "logps/chosen": -1.3903577327728271, + "logps/rejected": -1.3126481771469116, + "loss": 1.2731, + "odds_ratio_loss": 0.8095407485961914, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.06951788067817688, + "rewards/margins": -0.003885473357513547, + "rewards/rejected": -0.0656324103474617, + "sft_loss": 1.3903577327728271, + "step": 810 + }, + { + "epoch": 0.652, + "grad_norm": 8.775544099907432, + "learning_rate": 4.793229754952393e-06, + "logits/chosen": -0.9465087652206421, + "logits/rejected": -0.8528397679328918, + "logps/chosen": -1.3370808362960815, + "logps/rejected": -1.6353280544281006, + "loss": 1.2299, + "odds_ratio_loss": 0.5765669941902161, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06685404479503632, + "rewards/margins": 0.01491236686706543, + "rewards/rejected": -0.08176640421152115, + "sft_loss": 1.3370808362960815, + "step": 815 + }, + { + "epoch": 0.656, + "grad_norm": 9.946349607495078, + "learning_rate": 4.788571486639948e-06, + "logits/chosen": -0.538291335105896, + "logits/rejected": -0.8991764783859253, + "logps/chosen": -1.3249986171722412, + "logps/rejected": -1.2387126684188843, + "loss": 1.1861, + "odds_ratio_loss": 0.8351901173591614, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06624993681907654, + "rewards/margins": -0.004314306192100048, + "rewards/rejected": -0.061935633420944214, + "sft_loss": 1.3249986171722412, + "step": 820 + }, + { + "epoch": 0.66, + "grad_norm": 13.684922078518843, + "learning_rate": 4.783863644106502e-06, + "logits/chosen": -0.40957608819007874, + "logits/rejected": -1.0564932823181152, + "logps/chosen": -1.2083319425582886, + "logps/rejected": -1.3241065740585327, + "loss": 1.2727, + "odds_ratio_loss": 0.7248133420944214, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06041660159826279, + "rewards/margins": 0.005788723472505808, + "rewards/rejected": -0.06620533019304276, + "sft_loss": 1.2083319425582886, + "step": 825 + }, + { + "epoch": 0.664, + "grad_norm": 14.240624119385929, + "learning_rate": 4.779106329331665e-06, + "logits/chosen": -0.6208971738815308, + "logits/rejected": -0.53315269947052, + "logps/chosen": -1.327461838722229, + "logps/rejected": -1.1612827777862549, + "loss": 1.3201, + "odds_ratio_loss": 0.9040085077285767, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06637309491634369, + "rewards/margins": -0.008308951742947102, + "rewards/rejected": -0.058064140379428864, + "sft_loss": 1.327461838722229, + "step": 830 + }, + { + "epoch": 0.668, + "grad_norm": 6.175231699145468, + "learning_rate": 4.774299645366696e-06, + "logits/chosen": -0.5456127524375916, + "logits/rejected": -0.816623330116272, + "logps/chosen": -1.1675437688827515, + "logps/rejected": -1.4906387329101562, + "loss": 1.2384, + "odds_ratio_loss": 0.5253651142120361, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.058377187699079514, + "rewards/margins": 0.016154741868376732, + "rewards/rejected": -0.0745319351553917, + "sft_loss": 1.1675437688827515, + "step": 835 + }, + { + "epoch": 0.672, + "grad_norm": 9.640667146514005, + "learning_rate": 4.769443696332272e-06, + "logits/chosen": -0.2830110788345337, + "logits/rejected": -0.8570443391799927, + "logps/chosen": -1.2739179134368896, + "logps/rejected": -1.4723364114761353, + "loss": 1.223, + "odds_ratio_loss": 0.7033334374427795, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06369589269161224, + "rewards/margins": 0.009920930489897728, + "rewards/rejected": -0.07361682504415512, + "sft_loss": 1.2739179134368896, + "step": 840 + }, + { + "epoch": 0.676, + "grad_norm": 12.645968569930028, + "learning_rate": 4.764538587416233e-06, + "logits/chosen": -0.8710149526596069, + "logits/rejected": -0.8611608743667603, + "logps/chosen": -1.452150583267212, + "logps/rejected": -2.0592668056488037, + "loss": 1.3282, + "odds_ratio_loss": 0.6635312438011169, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07260753214359283, + "rewards/margins": 0.03035581484436989, + "rewards/rejected": -0.10296335071325302, + "sft_loss": 1.452150583267212, + "step": 845 + }, + { + "epoch": 0.68, + "grad_norm": 9.432527411843669, + "learning_rate": 4.759584424871302e-06, + "logits/chosen": -0.5329976081848145, + "logits/rejected": -0.8772062063217163, + "logps/chosen": -0.9151167869567871, + "logps/rejected": -1.6539852619171143, + "loss": 1.2911, + "odds_ratio_loss": 0.4387364387512207, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.045755840837955475, + "rewards/margins": 0.03694342449307442, + "rewards/rejected": -0.082699254155159, + "sft_loss": 0.9151167869567871, + "step": 850 + }, + { + "epoch": 0.684, + "grad_norm": 7.726661197517004, + "learning_rate": 4.754581316012785e-06, + "logits/chosen": -0.4633726477622986, + "logits/rejected": -0.5861400961875916, + "logps/chosen": -0.6932090520858765, + "logps/rejected": -1.0552204847335815, + "loss": 1.1924, + "odds_ratio_loss": 0.4030598998069763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034660451114177704, + "rewards/margins": 0.018100565299391747, + "rewards/rejected": -0.0527610182762146, + "sft_loss": 0.6932090520858765, + "step": 855 + }, + { + "epoch": 0.688, + "grad_norm": 9.894504014422436, + "learning_rate": 4.749529369216246e-06, + "logits/chosen": -0.42406773567199707, + "logits/rejected": -0.4842946529388428, + "logps/chosen": -1.0714144706726074, + "logps/rejected": -1.1619442701339722, + "loss": 1.352, + "odds_ratio_loss": 0.6463958024978638, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05357072502374649, + "rewards/margins": 0.0045264954678714275, + "rewards/rejected": -0.05809721350669861, + "sft_loss": 1.0714144706726074, + "step": 860 + }, + { + "epoch": 0.692, + "grad_norm": 9.153094977007724, + "learning_rate": 4.744428693915158e-06, + "logits/chosen": -0.2847048342227936, + "logits/rejected": -0.4418734610080719, + "logps/chosen": -1.2593896389007568, + "logps/rejected": -0.9671368598937988, + "loss": 1.3217, + "odds_ratio_loss": 0.9432659149169922, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -0.06296949088573456, + "rewards/margins": -0.014612642116844654, + "rewards/rejected": -0.04835684597492218, + "sft_loss": 1.2593896389007568, + "step": 865 + }, + { + "epoch": 0.696, + "grad_norm": 7.840444386866316, + "learning_rate": 4.7392794005985324e-06, + "logits/chosen": -0.4391769468784332, + "logits/rejected": -0.9956706762313843, + "logps/chosen": -1.0819652080535889, + "logps/rejected": -1.3912889957427979, + "loss": 1.1336, + "odds_ratio_loss": 0.5440673232078552, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.054098259657621384, + "rewards/margins": 0.01546618901193142, + "rewards/rejected": -0.06956445425748825, + "sft_loss": 1.0819652080535889, + "step": 870 + }, + { + "epoch": 0.7, + "grad_norm": 12.411494501286214, + "learning_rate": 4.734081600808531e-06, + "logits/chosen": -0.9742234349250793, + "logits/rejected": -0.4317501485347748, + "logps/chosen": -1.1081851720809937, + "logps/rejected": -1.167130947113037, + "loss": 1.2256, + "odds_ratio_loss": 0.6975724697113037, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0554092600941658, + "rewards/margins": 0.002947291126474738, + "rewards/rejected": -0.058356545865535736, + "sft_loss": 1.1081851720809937, + "step": 875 + }, + { + "epoch": 0.704, + "grad_norm": 8.306111344864336, + "learning_rate": 4.7288354071380415e-06, + "logits/chosen": -0.5642324686050415, + "logits/rejected": -0.8995448350906372, + "logps/chosen": -1.0954984426498413, + "logps/rejected": -1.2880576848983765, + "loss": 1.2105, + "odds_ratio_loss": 0.6666485071182251, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0547749288380146, + "rewards/margins": 0.009627962484955788, + "rewards/rejected": -0.06440288573503494, + "sft_loss": 1.0954984426498413, + "step": 880 + }, + { + "epoch": 0.708, + "grad_norm": 10.907946303534318, + "learning_rate": 4.723540933228245e-06, + "logits/chosen": -0.8124354481697083, + "logits/rejected": -0.5376805067062378, + "logps/chosen": -1.105232834815979, + "logps/rejected": -1.4322589635849, + "loss": 1.2672, + "odds_ratio_loss": 0.5457401871681213, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05526164174079895, + "rewards/margins": 0.016351303085684776, + "rewards/rejected": -0.07161294668912888, + "sft_loss": 1.105232834815979, + "step": 885 + }, + { + "epoch": 0.712, + "grad_norm": 11.344492337732651, + "learning_rate": 4.7181982937661485e-06, + "logits/chosen": -0.6914325952529907, + "logits/rejected": -0.9896718859672546, + "logps/chosen": -1.3581817150115967, + "logps/rejected": -1.8004964590072632, + "loss": 1.2577, + "odds_ratio_loss": 0.5711563229560852, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06790908426046371, + "rewards/margins": 0.02211574651300907, + "rewards/rejected": -0.09002482891082764, + "sft_loss": 1.3581817150115967, + "step": 890 + }, + { + "epoch": 0.716, + "grad_norm": 14.944195714469451, + "learning_rate": 4.712807604482108e-06, + "logits/chosen": -0.580794632434845, + "logits/rejected": -0.6542994379997253, + "logps/chosen": -1.1799992322921753, + "logps/rejected": -1.1197493076324463, + "loss": 1.2151, + "odds_ratio_loss": 0.8679493069648743, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.058999963104724884, + "rewards/margins": -0.0030124965123832226, + "rewards/rejected": -0.05598746985197067, + "sft_loss": 1.1799992322921753, + "step": 895 + }, + { + "epoch": 0.72, + "grad_norm": 6.69431914838341, + "learning_rate": 4.707368982147318e-06, + "logits/chosen": -0.26036280393600464, + "logits/rejected": -0.8896979093551636, + "logps/chosen": -1.174317479133606, + "logps/rejected": -0.9866326451301575, + "loss": 1.2563, + "odds_ratio_loss": 1.064286470413208, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.058715879917144775, + "rewards/margins": -0.009384247474372387, + "rewards/rejected": -0.049331631511449814, + "sft_loss": 1.174317479133606, + "step": 900 + }, + { + "epoch": 0.724, + "grad_norm": 4.7835304167015975, + "learning_rate": 4.701882544571277e-06, + "logits/chosen": -0.9345399141311646, + "logits/rejected": -0.6917155385017395, + "logps/chosen": -1.392803430557251, + "logps/rejected": -1.371321439743042, + "loss": 1.1668, + "odds_ratio_loss": 0.7409270405769348, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06964017450809479, + "rewards/margins": -0.001074108062312007, + "rewards/rejected": -0.06856606900691986, + "sft_loss": 1.392803430557251, + "step": 905 + }, + { + "epoch": 0.728, + "grad_norm": 11.02550077384499, + "learning_rate": 4.696348410599244e-06, + "logits/chosen": -0.5705204010009766, + "logits/rejected": -0.6748573184013367, + "logps/chosen": -1.0039920806884766, + "logps/rejected": -1.592974066734314, + "loss": 1.2679, + "odds_ratio_loss": 0.5323291420936584, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05019960552453995, + "rewards/margins": 0.029449105262756348, + "rewards/rejected": -0.0796487107872963, + "sft_loss": 1.0039920806884766, + "step": 910 + }, + { + "epoch": 0.732, + "grad_norm": 10.692588852434419, + "learning_rate": 4.690766700109659e-06, + "logits/chosen": -0.4835734963417053, + "logits/rejected": -0.7319347858428955, + "logps/chosen": -1.1647623777389526, + "logps/rejected": -2.5557546615600586, + "loss": 1.2441, + "odds_ratio_loss": 0.41198450326919556, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05823811888694763, + "rewards/margins": 0.06954962015151978, + "rewards/rejected": -0.1277877390384674, + "sft_loss": 1.1647623777389526, + "step": 915 + }, + { + "epoch": 0.736, + "grad_norm": 10.396971419498014, + "learning_rate": 4.685137534011549e-06, + "logits/chosen": -0.8939110040664673, + "logits/rejected": -0.3123689293861389, + "logps/chosen": -1.2236140966415405, + "logps/rejected": -1.3663610219955444, + "loss": 1.1922, + "odds_ratio_loss": 0.7821038365364075, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06118069961667061, + "rewards/margins": 0.007137349806725979, + "rewards/rejected": -0.06831805408000946, + "sft_loss": 1.2236140966415405, + "step": 920 + }, + { + "epoch": 0.74, + "grad_norm": 8.738002331287573, + "learning_rate": 4.679461034241906e-06, + "logits/chosen": -0.6164524555206299, + "logits/rejected": -0.7329466938972473, + "logps/chosen": -1.4917128086090088, + "logps/rejected": -1.479864478111267, + "loss": 1.2453, + "odds_ratio_loss": 0.770171046257019, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07458565384149551, + "rewards/margins": -0.0005924153956584632, + "rewards/rejected": -0.07399322092533112, + "sft_loss": 1.4917128086090088, + "step": 925 + }, + { + "epoch": 0.744, + "grad_norm": 23.09910613175578, + "learning_rate": 4.673737323763048e-06, + "logits/chosen": -0.6520587205886841, + "logits/rejected": -0.7833204865455627, + "logps/chosen": -1.1136635541915894, + "logps/rejected": -1.1649823188781738, + "loss": 1.1591, + "odds_ratio_loss": 0.724600613117218, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05568317696452141, + "rewards/margins": 0.0025659373495727777, + "rewards/rejected": -0.05824911594390869, + "sft_loss": 1.1136635541915894, + "step": 930 + }, + { + "epoch": 0.748, + "grad_norm": 9.576889593824372, + "learning_rate": 4.667966526559953e-06, + "logits/chosen": -0.7509557008743286, + "logits/rejected": -0.49857956171035767, + "logps/chosen": -1.3980618715286255, + "logps/rejected": -1.2008607387542725, + "loss": 1.3096, + "odds_ratio_loss": 0.9694040417671204, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.06990310549736023, + "rewards/margins": -0.009860062971711159, + "rewards/rejected": -0.06004303693771362, + "sft_loss": 1.3980618715286255, + "step": 935 + }, + { + "epoch": 0.752, + "grad_norm": 8.475227307177539, + "learning_rate": 4.662148767637578e-06, + "logits/chosen": -0.5015154480934143, + "logits/rejected": -0.7089800834655762, + "logps/chosen": -1.1982038021087646, + "logps/rejected": -1.4972496032714844, + "loss": 1.2186, + "odds_ratio_loss": 0.5610604286193848, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.059910185635089874, + "rewards/margins": 0.014952296391129494, + "rewards/rejected": -0.07486248016357422, + "sft_loss": 1.1982038021087646, + "step": 940 + }, + { + "epoch": 0.756, + "grad_norm": 10.570484526302579, + "learning_rate": 4.656284173018144e-06, + "logits/chosen": -0.6629132628440857, + "logits/rejected": -0.8142105340957642, + "logps/chosen": -1.2457422018051147, + "logps/rejected": -1.447788953781128, + "loss": 1.2879, + "odds_ratio_loss": 0.5766414403915405, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0622871108353138, + "rewards/margins": 0.010102340951561928, + "rewards/rejected": -0.07238946110010147, + "sft_loss": 1.2457422018051147, + "step": 945 + }, + { + "epoch": 0.76, + "grad_norm": 8.792884078555856, + "learning_rate": 4.650372869738415e-06, + "logits/chosen": -0.38089218735694885, + "logits/rejected": -0.724978506565094, + "logps/chosen": -1.0249511003494263, + "logps/rejected": -1.4063374996185303, + "loss": 1.3351, + "odds_ratio_loss": 0.5218092799186707, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05124755948781967, + "rewards/margins": 0.01906932331621647, + "rewards/rejected": -0.07031688839197159, + "sft_loss": 1.0249511003494263, + "step": 950 + }, + { + "epoch": 0.764, + "grad_norm": 9.019548926117617, + "learning_rate": 4.644414985846934e-06, + "logits/chosen": -0.5280020833015442, + "logits/rejected": -1.183532953262329, + "logps/chosen": -1.1438463926315308, + "logps/rejected": -1.5815012454986572, + "loss": 1.311, + "odds_ratio_loss": 0.4890708029270172, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.057192325592041016, + "rewards/margins": 0.02188274636864662, + "rewards/rejected": -0.07907506823539734, + "sft_loss": 1.1438463926315308, + "step": 955 + }, + { + "epoch": 0.768, + "grad_norm": 10.010010873228056, + "learning_rate": 4.638410650401267e-06, + "logits/chosen": -0.6906821131706238, + "logits/rejected": -0.42066025733947754, + "logps/chosen": -0.9353683590888977, + "logps/rejected": -1.1939796209335327, + "loss": 1.2082, + "odds_ratio_loss": 0.6122977137565613, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.046768419444561005, + "rewards/margins": 0.012930569238960743, + "rewards/rejected": -0.059698980301618576, + "sft_loss": 0.9353683590888977, + "step": 960 + }, + { + "epoch": 0.772, + "grad_norm": 13.636523513835607, + "learning_rate": 4.632359993465188e-06, + "logits/chosen": -0.7019423842430115, + "logits/rejected": -0.8399526476860046, + "logps/chosen": -1.0443174839019775, + "logps/rejected": -1.1119390726089478, + "loss": 1.2281, + "odds_ratio_loss": 0.7568255066871643, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.05221586674451828, + "rewards/margins": 0.0033810839522629976, + "rewards/rejected": -0.055596958845853806, + "sft_loss": 1.0443174839019775, + "step": 965 + }, + { + "epoch": 0.776, + "grad_norm": 10.680468517829564, + "learning_rate": 4.626263146105875e-06, + "logits/chosen": -0.7569029927253723, + "logits/rejected": -1.1442753076553345, + "logps/chosen": -1.0383312702178955, + "logps/rejected": -1.2682640552520752, + "loss": 1.2608, + "odds_ratio_loss": 0.715472936630249, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.051916562020778656, + "rewards/margins": 0.011496638879179955, + "rewards/rejected": -0.06341320276260376, + "sft_loss": 1.0383312702178955, + "step": 970 + }, + { + "epoch": 0.78, + "grad_norm": 16.270504514158986, + "learning_rate": 4.620120240391065e-06, + "logits/chosen": -0.5434718728065491, + "logits/rejected": -0.8585309982299805, + "logps/chosen": -1.0939325094223022, + "logps/rejected": -1.3599112033843994, + "loss": 1.2639, + "odds_ratio_loss": 0.7648831605911255, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.05469663068652153, + "rewards/margins": 0.01329893060028553, + "rewards/rejected": -0.06799556314945221, + "sft_loss": 1.0939325094223022, + "step": 975 + }, + { + "epoch": 0.784, + "grad_norm": 5.804422084174317, + "learning_rate": 4.613931409386196e-06, + "logits/chosen": -0.7977913022041321, + "logits/rejected": -0.9274693727493286, + "logps/chosen": -1.18310546875, + "logps/rejected": -1.8053483963012695, + "loss": 1.2574, + "odds_ratio_loss": 0.4495466351509094, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.059155285358428955, + "rewards/margins": 0.031112130731344223, + "rewards/rejected": -0.09026740491390228, + "sft_loss": 1.18310546875, + "step": 980 + }, + { + "epoch": 0.788, + "grad_norm": 9.788416898738078, + "learning_rate": 4.607696787151522e-06, + "logits/chosen": -0.9438807368278503, + "logits/rejected": -0.6228753328323364, + "logps/chosen": -1.5850467681884766, + "logps/rejected": -1.510814905166626, + "loss": 1.1717, + "odds_ratio_loss": 0.8166835904121399, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07925233989953995, + "rewards/margins": -0.0037115835584700108, + "rewards/rejected": -0.07554075121879578, + "sft_loss": 1.5850467681884766, + "step": 985 + }, + { + "epoch": 0.792, + "grad_norm": 12.55818469153721, + "learning_rate": 4.601416508739211e-06, + "logits/chosen": -0.731402575969696, + "logits/rejected": -1.0273559093475342, + "logps/chosen": -1.129149317741394, + "logps/rejected": -1.219930648803711, + "loss": 1.2695, + "odds_ratio_loss": 0.6738437414169312, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05645746737718582, + "rewards/margins": 0.0045390622690320015, + "rewards/rejected": -0.06099652498960495, + "sft_loss": 1.129149317741394, + "step": 990 + }, + { + "epoch": 0.796, + "grad_norm": 9.153552767046978, + "learning_rate": 4.595090710190419e-06, + "logits/chosen": -0.7405273914337158, + "logits/rejected": -1.1718242168426514, + "logps/chosen": -1.2145724296569824, + "logps/rejected": -1.4828448295593262, + "loss": 1.1849, + "odds_ratio_loss": 0.5643509030342102, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06072862073779106, + "rewards/margins": 0.013413618318736553, + "rewards/rejected": -0.07414223998785019, + "sft_loss": 1.2145724296569824, + "step": 995 + }, + { + "epoch": 0.8, + "grad_norm": 6.94868849947181, + "learning_rate": 4.588719528532342e-06, + "logits/chosen": -0.7134780883789062, + "logits/rejected": -0.905554473400116, + "logps/chosen": -1.4175317287445068, + "logps/rejected": -1.3552976846694946, + "loss": 1.3012, + "odds_ratio_loss": 0.8563035130500793, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.0708765834569931, + "rewards/margins": -0.003111699130386114, + "rewards/rejected": -0.06776488572359085, + "sft_loss": 1.4175317287445068, + "step": 1000 + }, + { + "epoch": 0.804, + "grad_norm": 12.921749906218944, + "learning_rate": 4.582303101775249e-06, + "logits/chosen": -0.7810646891593933, + "logits/rejected": -1.1520473957061768, + "logps/chosen": -1.2127867937088013, + "logps/rejected": -1.4734896421432495, + "loss": 1.2652, + "odds_ratio_loss": 0.5705010890960693, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06063934043049812, + "rewards/margins": 0.013035142794251442, + "rewards/rejected": -0.07367448508739471, + "sft_loss": 1.2127867937088013, + "step": 1005 + }, + { + "epoch": 0.808, + "grad_norm": 9.197435023909987, + "learning_rate": 4.575841568909494e-06, + "logits/chosen": -0.6398459672927856, + "logits/rejected": -0.5644738078117371, + "logps/chosen": -1.268541932106018, + "logps/rejected": -1.5668294429779053, + "loss": 1.2232, + "odds_ratio_loss": 0.6429783701896667, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06342709064483643, + "rewards/margins": 0.014914381317794323, + "rewards/rejected": -0.07834147661924362, + "sft_loss": 1.268541932106018, + "step": 1010 + }, + { + "epoch": 0.812, + "grad_norm": 5.987738310425259, + "learning_rate": 4.569335069902502e-06, + "logits/chosen": -0.8146150708198547, + "logits/rejected": -0.5185267925262451, + "logps/chosen": -1.2354018688201904, + "logps/rejected": -1.3597724437713623, + "loss": 1.2304, + "odds_ratio_loss": 0.7136649489402771, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06177009269595146, + "rewards/margins": 0.006218533962965012, + "rewards/rejected": -0.06798862665891647, + "sft_loss": 1.2354018688201904, + "step": 1015 + }, + { + "epoch": 0.816, + "grad_norm": 11.820317098577457, + "learning_rate": 4.562783745695738e-06, + "logits/chosen": -0.6676985621452332, + "logits/rejected": -0.3732604682445526, + "logps/chosen": -1.0979827642440796, + "logps/rejected": -1.6110618114471436, + "loss": 1.2048, + "odds_ratio_loss": 0.6128841638565063, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05489913746714592, + "rewards/margins": 0.025653952732682228, + "rewards/rejected": -0.0805530920624733, + "sft_loss": 1.0979827642440796, + "step": 1020 + }, + { + "epoch": 0.82, + "grad_norm": 12.430320372288909, + "learning_rate": 4.556187738201656e-06, + "logits/chosen": -0.3898774981498718, + "logits/rejected": -1.3098582029342651, + "logps/chosen": -0.9205626249313354, + "logps/rejected": -1.4675109386444092, + "loss": 1.2001, + "odds_ratio_loss": 0.6006077527999878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04602813720703125, + "rewards/margins": 0.02734740450978279, + "rewards/rejected": -0.07337555289268494, + "sft_loss": 0.9205626249313354, + "step": 1025 + }, + { + "epoch": 0.824, + "grad_norm": 8.240787854800642, + "learning_rate": 4.549547190300622e-06, + "logits/chosen": -0.749883234500885, + "logits/rejected": -0.5567277669906616, + "logps/chosen": -1.1923110485076904, + "logps/rejected": -1.448547124862671, + "loss": 1.2761, + "odds_ratio_loss": 0.5901089906692505, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05961555242538452, + "rewards/margins": 0.012811805121600628, + "rewards/rejected": -0.07242736220359802, + "sft_loss": 1.1923110485076904, + "step": 1030 + }, + { + "epoch": 0.828, + "grad_norm": 8.576070028214076, + "learning_rate": 4.542862245837821e-06, + "logits/chosen": -0.4225188195705414, + "logits/rejected": -0.694170355796814, + "logps/chosen": -1.115942358970642, + "logps/rejected": -1.1966840028762817, + "loss": 1.3326, + "odds_ratio_loss": 0.7018226385116577, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.055797118693590164, + "rewards/margins": 0.004037079401314259, + "rewards/rejected": -0.05983419343829155, + "sft_loss": 1.115942358970642, + "step": 1035 + }, + { + "epoch": 0.832, + "grad_norm": 8.003040390950462, + "learning_rate": 4.536133049620143e-06, + "logits/chosen": -0.614066481590271, + "logits/rejected": -0.9744704365730286, + "logps/chosen": -1.1512901782989502, + "logps/rejected": -1.686820387840271, + "loss": 1.3049, + "odds_ratio_loss": 0.5957285761833191, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05756450816988945, + "rewards/margins": 0.0267765112221241, + "rewards/rejected": -0.08434101194143295, + "sft_loss": 1.1512901782989502, + "step": 1040 + }, + { + "epoch": 0.836, + "grad_norm": 7.214120995777482, + "learning_rate": 4.529359747413038e-06, + "logits/chosen": -0.602199912071228, + "logits/rejected": -1.299626350402832, + "logps/chosen": -1.2445770502090454, + "logps/rejected": -1.3810675144195557, + "loss": 1.3109, + "odds_ratio_loss": 0.7491277456283569, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.062228865921497345, + "rewards/margins": 0.00682451855391264, + "rewards/rejected": -0.06905338168144226, + "sft_loss": 1.2445770502090454, + "step": 1045 + }, + { + "epoch": 0.84, + "grad_norm": 7.796605464265155, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": -0.3931456208229065, + "logits/rejected": -0.8093290328979492, + "logps/chosen": -1.3564033508300781, + "logps/rejected": -1.3207740783691406, + "loss": 1.2757, + "odds_ratio_loss": 0.8102067708969116, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06782017648220062, + "rewards/margins": -0.0017814624588936567, + "rewards/rejected": -0.06603871285915375, + "sft_loss": 1.3564033508300781, + "step": 1050 + }, + { + "epoch": 0.844, + "grad_norm": 11.038841006173273, + "learning_rate": 4.515681412866228e-06, + "logits/chosen": -0.8583968281745911, + "logits/rejected": -0.5115640163421631, + "logps/chosen": -1.3405869007110596, + "logps/rejected": -1.1792690753936768, + "loss": 1.2427, + "odds_ratio_loss": 1.1266233921051025, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06702934950590134, + "rewards/margins": -0.008065891452133656, + "rewards/rejected": -0.058963458985090256, + "sft_loss": 1.3405869007110596, + "step": 1055 + }, + { + "epoch": 0.848, + "grad_norm": 37.58723049143948, + "learning_rate": 4.508776676821739e-06, + "logits/chosen": -0.6703466773033142, + "logits/rejected": -0.9433542490005493, + "logps/chosen": -1.2843058109283447, + "logps/rejected": -1.6363897323608398, + "loss": 1.359, + "odds_ratio_loss": 0.5473671555519104, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0642152950167656, + "rewards/margins": 0.017604198306798935, + "rewards/rejected": -0.08181948959827423, + "sft_loss": 1.2843058109283447, + "step": 1060 + }, + { + "epoch": 0.852, + "grad_norm": 12.74608944254742, + "learning_rate": 4.501828427371834e-06, + "logits/chosen": -0.3732682168483734, + "logits/rejected": -0.7298794984817505, + "logps/chosen": -1.0062689781188965, + "logps/rejected": -1.6333110332489014, + "loss": 1.225, + "odds_ratio_loss": 0.45508384704589844, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.050313450396060944, + "rewards/margins": 0.031352099031209946, + "rewards/rejected": -0.08166555315256119, + "sft_loss": 1.0062689781188965, + "step": 1065 + }, + { + "epoch": 0.856, + "grad_norm": 4.647181122267829, + "learning_rate": 4.494836815027022e-06, + "logits/chosen": -0.744674026966095, + "logits/rejected": -0.5776953101158142, + "logps/chosen": -1.1960818767547607, + "logps/rejected": -1.277769684791565, + "loss": 1.1808, + "odds_ratio_loss": 0.6548304557800293, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05980409309267998, + "rewards/margins": 0.0040843975730240345, + "rewards/rejected": -0.06388849020004272, + "sft_loss": 1.1960818767547607, + "step": 1070 + }, + { + "epoch": 0.86, + "grad_norm": 9.112232033291143, + "learning_rate": 4.48780199123712e-06, + "logits/chosen": -0.48859691619873047, + "logits/rejected": -0.8226814270019531, + "logps/chosen": -0.9840165972709656, + "logps/rejected": -1.3010642528533936, + "loss": 1.3173, + "odds_ratio_loss": 0.5319587588310242, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04920082911849022, + "rewards/margins": 0.01585238054394722, + "rewards/rejected": -0.06505320966243744, + "sft_loss": 0.9840165972709656, + "step": 1075 + }, + { + "epoch": 0.864, + "grad_norm": 8.357848437646894, + "learning_rate": 4.4807241083879774e-06, + "logits/chosen": -0.7419109344482422, + "logits/rejected": -0.5743356347084045, + "logps/chosen": -1.0393415689468384, + "logps/rejected": -1.860235571861267, + "loss": 1.1895, + "odds_ratio_loss": 0.3943214416503906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0519670769572258, + "rewards/margins": 0.0410446934401989, + "rewards/rejected": -0.0930117815732956, + "sft_loss": 1.0393415689468384, + "step": 1080 + }, + { + "epoch": 0.868, + "grad_norm": 7.001917916478807, + "learning_rate": 4.473603319798173e-06, + "logits/chosen": -0.8395317196846008, + "logits/rejected": -0.760552704334259, + "logps/chosen": -1.2476990222930908, + "logps/rejected": -1.3389023542404175, + "loss": 1.2252, + "odds_ratio_loss": 0.6428620219230652, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0623849555850029, + "rewards/margins": 0.004560163244605064, + "rewards/rejected": -0.06694512069225311, + "sft_loss": 1.2476990222930908, + "step": 1085 + }, + { + "epoch": 0.872, + "grad_norm": 7.900019571893691, + "learning_rate": 4.466439779715696e-06, + "logits/chosen": -0.68291836977005, + "logits/rejected": -0.886945903301239, + "logps/chosen": -1.321554183959961, + "logps/rejected": -1.8369743824005127, + "loss": 1.2689, + "odds_ratio_loss": 0.5644279718399048, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06607771664857864, + "rewards/margins": 0.02577100321650505, + "rewards/rejected": -0.0918487161397934, + "sft_loss": 1.321554183959961, + "step": 1090 + }, + { + "epoch": 0.876, + "grad_norm": 8.642864900662888, + "learning_rate": 4.4592336433146e-06, + "logits/chosen": -0.633298397064209, + "logits/rejected": -1.161464810371399, + "logps/chosen": -1.0371453762054443, + "logps/rejected": -1.3207931518554688, + "loss": 1.1633, + "odds_ratio_loss": 0.5788105726242065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.051857270300388336, + "rewards/margins": 0.01418238878250122, + "rewards/rejected": -0.06603965908288956, + "sft_loss": 1.0371453762054443, + "step": 1095 + }, + { + "epoch": 0.88, + "grad_norm": 9.05386821771176, + "learning_rate": 4.451985066691649e-06, + "logits/chosen": -0.7766702771186829, + "logits/rejected": -0.5547041893005371, + "logps/chosen": -1.1104316711425781, + "logps/rejected": -1.3220654726028442, + "loss": 1.3197, + "odds_ratio_loss": 0.6292269229888916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.055521585047245026, + "rewards/margins": 0.010581688955426216, + "rewards/rejected": -0.06610327214002609, + "sft_loss": 1.1104316711425781, + "step": 1100 + }, + { + "epoch": 0.884, + "grad_norm": 6.120618129770535, + "learning_rate": 4.444694206862929e-06, + "logits/chosen": -0.7643265128135681, + "logits/rejected": -0.858683705329895, + "logps/chosen": -1.2096381187438965, + "logps/rejected": -1.3979839086532593, + "loss": 1.1843, + "odds_ratio_loss": 0.7661855220794678, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06048189848661423, + "rewards/margins": 0.00941728986799717, + "rewards/rejected": -0.06989918649196625, + "sft_loss": 1.2096381187438965, + "step": 1105 + }, + { + "epoch": 0.888, + "grad_norm": 11.782139121174437, + "learning_rate": 4.437361221760449e-06, + "logits/chosen": -0.5270341634750366, + "logits/rejected": -0.7325950264930725, + "logps/chosen": -1.3420865535736084, + "logps/rejected": -1.8302757740020752, + "loss": 1.2491, + "odds_ratio_loss": 0.6886317133903503, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06710433214902878, + "rewards/margins": 0.024409450590610504, + "rewards/rejected": -0.09151377528905869, + "sft_loss": 1.3420865535736084, + "step": 1110 + }, + { + "epoch": 0.892, + "grad_norm": 18.68798830441005, + "learning_rate": 4.4299862702287255e-06, + "logits/chosen": -0.5396694540977478, + "logits/rejected": -0.6563701033592224, + "logps/chosen": -1.2778229713439941, + "logps/rejected": -1.229236364364624, + "loss": 1.2489, + "odds_ratio_loss": 0.869365394115448, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06389115005731583, + "rewards/margins": -0.0024293330498039722, + "rewards/rejected": -0.06146181747317314, + "sft_loss": 1.2778229713439941, + "step": 1115 + }, + { + "epoch": 0.896, + "grad_norm": 8.493094923954416, + "learning_rate": 4.422569512021332e-06, + "logits/chosen": -0.43230313062667847, + "logits/rejected": -1.0870712995529175, + "logps/chosen": -1.1300268173217773, + "logps/rejected": -1.6185880899429321, + "loss": 1.186, + "odds_ratio_loss": 0.4703669548034668, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05650133639574051, + "rewards/margins": 0.02442805841565132, + "rewards/rejected": -0.08092939108610153, + "sft_loss": 1.1300268173217773, + "step": 1120 + }, + { + "epoch": 0.9, + "grad_norm": 5.267928354352151, + "learning_rate": 4.415111107797445e-06, + "logits/chosen": -0.5481151342391968, + "logits/rejected": -0.9923849105834961, + "logps/chosen": -1.230312466621399, + "logps/rejected": -1.5415050983428955, + "loss": 1.2303, + "odds_ratio_loss": 0.5594199299812317, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.061515629291534424, + "rewards/margins": 0.015559625811874866, + "rewards/rejected": -0.07707525044679642, + "sft_loss": 1.230312466621399, + "step": 1125 + }, + { + "epoch": 0.904, + "grad_norm": 8.086520043398188, + "learning_rate": 4.407611219118363e-06, + "logits/chosen": -0.3550862669944763, + "logits/rejected": -0.8939773440361023, + "logps/chosen": -1.110655665397644, + "logps/rejected": -1.5090737342834473, + "loss": 1.2223, + "odds_ratio_loss": 0.6366047263145447, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0555327832698822, + "rewards/margins": 0.01992090605199337, + "rewards/rejected": -0.07545368373394012, + "sft_loss": 1.110655665397644, + "step": 1130 + }, + { + "epoch": 0.908, + "grad_norm": 8.346110146833315, + "learning_rate": 4.4000700084440046e-06, + "logits/chosen": -0.468745619058609, + "logits/rejected": -1.0615545511245728, + "logps/chosen": -1.219507098197937, + "logps/rejected": -1.5035933256149292, + "loss": 1.2581, + "odds_ratio_loss": 0.9526287317276001, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06097535416483879, + "rewards/margins": 0.014204311184585094, + "rewards/rejected": -0.07517966628074646, + "sft_loss": 1.219507098197937, + "step": 1135 + }, + { + "epoch": 0.912, + "grad_norm": 21.962006553169825, + "learning_rate": 4.3924876391293915e-06, + "logits/chosen": -0.5484793782234192, + "logits/rejected": -0.46796178817749023, + "logps/chosen": -1.108927845954895, + "logps/rejected": -1.0250526666641235, + "loss": 1.2947, + "odds_ratio_loss": 0.8142274618148804, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.05544638633728027, + "rewards/margins": -0.004193754401057959, + "rewards/rejected": -0.05125263333320618, + "sft_loss": 1.108927845954895, + "step": 1140 + }, + { + "epoch": 0.916, + "grad_norm": 15.332345311015494, + "learning_rate": 4.384864275421109e-06, + "logits/chosen": -0.5269834399223328, + "logits/rejected": -0.967495322227478, + "logps/chosen": -1.288408875465393, + "logps/rejected": -1.6966063976287842, + "loss": 1.2474, + "odds_ratio_loss": 0.5316591262817383, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06442044675350189, + "rewards/margins": 0.020409878343343735, + "rewards/rejected": -0.08483032882213593, + "sft_loss": 1.288408875465393, + "step": 1145 + }, + { + "epoch": 0.92, + "grad_norm": 10.130919066814135, + "learning_rate": 4.377200082453748e-06, + "logits/chosen": -0.5598348379135132, + "logits/rejected": -0.8047167062759399, + "logps/chosen": -1.2670339345932007, + "logps/rejected": -1.4398491382598877, + "loss": 1.2942, + "odds_ratio_loss": 0.6326471567153931, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06335169821977615, + "rewards/margins": 0.008640759624540806, + "rewards/rejected": -0.07199246436357498, + "sft_loss": 1.2670339345932007, + "step": 1150 + }, + { + "epoch": 0.924, + "grad_norm": 6.4155914839593855, + "learning_rate": 4.36949522624633e-06, + "logits/chosen": -0.8152793645858765, + "logits/rejected": -0.6962675452232361, + "logps/chosen": -1.3344051837921143, + "logps/rejected": -1.4677374362945557, + "loss": 1.2375, + "odds_ratio_loss": 0.715882420539856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06672026216983795, + "rewards/margins": 0.006666617002338171, + "rewards/rejected": -0.07338687032461166, + "sft_loss": 1.3344051837921143, + "step": 1155 + }, + { + "epoch": 0.928, + "grad_norm": 7.363210525745847, + "learning_rate": 4.361749873698707e-06, + "logits/chosen": -0.5409479737281799, + "logits/rejected": -0.722708523273468, + "logps/chosen": -1.1098625659942627, + "logps/rejected": -1.3639353513717651, + "loss": 1.2489, + "odds_ratio_loss": 0.7049247026443481, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05549313500523567, + "rewards/margins": 0.012703632935881615, + "rewards/rejected": -0.06819676607847214, + "sft_loss": 1.1098625659942627, + "step": 1160 + }, + { + "epoch": 0.932, + "grad_norm": 10.93689863299814, + "learning_rate": 4.353964192587949e-06, + "logits/chosen": -0.7145729064941406, + "logits/rejected": -0.849127471446991, + "logps/chosen": -1.3707650899887085, + "logps/rejected": -1.4907456636428833, + "loss": 1.2284, + "odds_ratio_loss": 0.6671477556228638, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.06853825598955154, + "rewards/margins": 0.005999024957418442, + "rewards/rejected": -0.07453727722167969, + "sft_loss": 1.3707650899887085, + "step": 1165 + }, + { + "epoch": 0.936, + "grad_norm": 6.2833560952473375, + "learning_rate": 4.346138351564711e-06, + "logits/chosen": -0.5584260821342468, + "logits/rejected": -1.0391467809677124, + "logps/chosen": -1.1828453540802002, + "logps/rejected": -1.3176859617233276, + "loss": 1.2247, + "odds_ratio_loss": 0.6230691075325012, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05914226919412613, + "rewards/margins": 0.006742039229720831, + "rewards/rejected": -0.0658842995762825, + "sft_loss": 1.1828453540802002, + "step": 1170 + }, + { + "epoch": 0.94, + "grad_norm": 7.981319588041361, + "learning_rate": 4.338272520149572e-06, + "logits/chosen": -0.48743829131126404, + "logits/rejected": -0.8333004713058472, + "logps/chosen": -1.7361185550689697, + "logps/rejected": -1.4733507633209229, + "loss": 1.2669, + "odds_ratio_loss": 0.959330677986145, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08680592477321625, + "rewards/margins": -0.013138381764292717, + "rewards/rejected": -0.07366754859685898, + "sft_loss": 1.7361185550689697, + "step": 1175 + }, + { + "epoch": 0.944, + "grad_norm": 7.109380757572563, + "learning_rate": 4.330366868729376e-06, + "logits/chosen": -0.22841350734233856, + "logits/rejected": -0.941692054271698, + "logps/chosen": -1.0154824256896973, + "logps/rejected": -1.5618747472763062, + "loss": 1.2833, + "odds_ratio_loss": 0.5690563321113586, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.050774119794368744, + "rewards/margins": 0.02731963060796261, + "rewards/rejected": -0.0780937448143959, + "sft_loss": 1.0154824256896973, + "step": 1180 + }, + { + "epoch": 0.948, + "grad_norm": 13.318206186445765, + "learning_rate": 4.322421568553529e-06, + "logits/chosen": -0.38624149560928345, + "logits/rejected": -1.0594637393951416, + "logps/chosen": -1.07871413230896, + "logps/rejected": -1.562281847000122, + "loss": 1.2659, + "odds_ratio_loss": 0.5322721600532532, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.053935706615448, + "rewards/margins": 0.02417839504778385, + "rewards/rejected": -0.0781140998005867, + "sft_loss": 1.07871413230896, + "step": 1185 + }, + { + "epoch": 0.952, + "grad_norm": 9.11517027692733, + "learning_rate": 4.3144367917302964e-06, + "logits/chosen": -0.8500941395759583, + "logits/rejected": -0.7091385722160339, + "logps/chosen": -1.1739693880081177, + "logps/rejected": -1.5754871368408203, + "loss": 1.2802, + "odds_ratio_loss": 0.6279300451278687, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.058698467910289764, + "rewards/margins": 0.02007589302957058, + "rewards/rejected": -0.0787743553519249, + "sft_loss": 1.1739693880081177, + "step": 1190 + }, + { + "epoch": 0.956, + "grad_norm": 7.697301408661275, + "learning_rate": 4.30641271122307e-06, + "logits/chosen": -0.4521522521972656, + "logits/rejected": -0.8235799670219421, + "logps/chosen": -1.2677797079086304, + "logps/rejected": -1.3189507722854614, + "loss": 1.2587, + "odds_ratio_loss": 0.691112756729126, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06338898837566376, + "rewards/margins": 0.002558555454015732, + "rewards/rejected": -0.06594754755496979, + "sft_loss": 1.2677797079086304, + "step": 1195 + }, + { + "epoch": 0.96, + "grad_norm": 9.997266919643533, + "learning_rate": 4.2983495008466285e-06, + "logits/chosen": -0.8174387216567993, + "logits/rejected": -0.7150999307632446, + "logps/chosen": -0.8913282155990601, + "logps/rejected": -1.1885263919830322, + "loss": 1.2154, + "odds_ratio_loss": 0.6406058073043823, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04456641152501106, + "rewards/margins": 0.014859904535114765, + "rewards/rejected": -0.05942631885409355, + "sft_loss": 0.8913282155990601, + "step": 1200 + }, + { + "epoch": 0.964, + "grad_norm": 5.221880235557841, + "learning_rate": 4.290247335263362e-06, + "logits/chosen": -1.0014302730560303, + "logits/rejected": -0.9239674806594849, + "logps/chosen": -1.0359283685684204, + "logps/rejected": -1.3768165111541748, + "loss": 1.2759, + "odds_ratio_loss": 0.6447513103485107, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05179642513394356, + "rewards/margins": 0.017044400796294212, + "rewards/rejected": -0.06884082406759262, + "sft_loss": 1.0359283685684204, + "step": 1205 + }, + { + "epoch": 0.968, + "grad_norm": 6.187170258551322, + "learning_rate": 4.2821063899795015e-06, + "logits/chosen": -0.39787670969963074, + "logits/rejected": -1.0457478761672974, + "logps/chosen": -1.1205229759216309, + "logps/rejected": -1.4367693662643433, + "loss": 1.2251, + "odds_ratio_loss": 0.5175265073776245, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0560261532664299, + "rewards/margins": 0.015812328085303307, + "rewards/rejected": -0.07183848321437836, + "sft_loss": 1.1205229759216309, + "step": 1210 + }, + { + "epoch": 0.972, + "grad_norm": 5.076678498483747, + "learning_rate": 4.273926841341303e-06, + "logits/chosen": -0.9177725911140442, + "logits/rejected": -0.9428675770759583, + "logps/chosen": -1.2810909748077393, + "logps/rejected": -1.7306153774261475, + "loss": 1.2798, + "odds_ratio_loss": 0.6016807556152344, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06405454128980637, + "rewards/margins": 0.02247622422873974, + "rewards/rejected": -0.08653075993061066, + "sft_loss": 1.2810909748077393, + "step": 1215 + }, + { + "epoch": 0.976, + "grad_norm": 6.982415041589051, + "learning_rate": 4.265708866531238e-06, + "logits/chosen": -0.8390816450119019, + "logits/rejected": -0.5559083223342896, + "logps/chosen": -0.9034653902053833, + "logps/rejected": -1.1532642841339111, + "loss": 1.2121, + "odds_ratio_loss": 0.5824159383773804, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0451732762157917, + "rewards/margins": 0.01248995028436184, + "rewards/rejected": -0.057663220912218094, + "sft_loss": 0.9034653902053833, + "step": 1220 + }, + { + "epoch": 0.98, + "grad_norm": 8.854119024299864, + "learning_rate": 4.257452643564155e-06, + "logits/chosen": -0.40446987748146057, + "logits/rejected": -0.9087270498275757, + "logps/chosen": -0.8603678941726685, + "logps/rejected": -1.3758715391159058, + "loss": 1.1785, + "odds_ratio_loss": 0.416604608297348, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0430183932185173, + "rewards/margins": 0.025775188580155373, + "rewards/rejected": -0.06879357993602753, + "sft_loss": 0.8603678941726685, + "step": 1225 + }, + { + "epoch": 0.984, + "grad_norm": 8.750528387170686, + "learning_rate": 4.249158351283414e-06, + "logits/chosen": -1.0308736562728882, + "logits/rejected": -0.8786516189575195, + "logps/chosen": -0.9902039766311646, + "logps/rejected": -1.3153424263000488, + "loss": 1.2155, + "odds_ratio_loss": 0.4901192784309387, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04951019585132599, + "rewards/margins": 0.016256922855973244, + "rewards/rejected": -0.06576712429523468, + "sft_loss": 0.9902039766311646, + "step": 1230 + }, + { + "epoch": 0.988, + "grad_norm": 10.866038696658444, + "learning_rate": 4.240826169357024e-06, + "logits/chosen": -1.0902023315429688, + "logits/rejected": -1.481338620185852, + "logps/chosen": -1.419777750968933, + "logps/rejected": -1.2114101648330688, + "loss": 1.3133, + "odds_ratio_loss": 0.9872404932975769, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.07098889350891113, + "rewards/margins": -0.01041838526725769, + "rewards/rejected": -0.06057050824165344, + "sft_loss": 1.419777750968933, + "step": 1235 + }, + { + "epoch": 0.992, + "grad_norm": 7.937224210186921, + "learning_rate": 4.232456278273743e-06, + "logits/chosen": -0.6265432834625244, + "logits/rejected": -0.510567843914032, + "logps/chosen": -1.0876511335372925, + "logps/rejected": -1.4437576532363892, + "loss": 1.1532, + "odds_ratio_loss": 0.5487557649612427, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.054382555186748505, + "rewards/margins": 0.017805328592658043, + "rewards/rejected": -0.0721878856420517, + "sft_loss": 1.0876511335372925, + "step": 1240 + }, + { + "epoch": 0.996, + "grad_norm": 18.05729223024994, + "learning_rate": 4.224048859339175e-06, + "logits/chosen": -0.4967614710330963, + "logits/rejected": -0.7329981327056885, + "logps/chosen": -1.0347087383270264, + "logps/rejected": -1.6826976537704468, + "loss": 1.3198, + "odds_ratio_loss": 0.5197392702102661, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.051735442131757736, + "rewards/margins": 0.03239942714571953, + "rewards/rejected": -0.08413486927747726, + "sft_loss": 1.0347087383270264, + "step": 1245 + }, + { + "epoch": 1.0, + "grad_norm": 8.219444479887624, + "learning_rate": 4.215604094671835e-06, + "logits/chosen": -0.5567103624343872, + "logits/rejected": -0.5455074310302734, + "logps/chosen": -1.3113296031951904, + "logps/rejected": -1.5509188175201416, + "loss": 1.1873, + "odds_ratio_loss": 0.6092044711112976, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06556646525859833, + "rewards/margins": 0.01197946909815073, + "rewards/rejected": -0.07754594087600708, + "sft_loss": 1.3113296031951904, + "step": 1250 + }, + { + "epoch": 1.004, + "grad_norm": 6.382865998528307, + "learning_rate": 4.207122167199209e-06, + "logits/chosen": -0.6577066779136658, + "logits/rejected": -0.7173200845718384, + "logps/chosen": -0.8776922225952148, + "logps/rejected": -1.7570756673812866, + "loss": 0.8296, + "odds_ratio_loss": 0.33190062642097473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04388461261987686, + "rewards/margins": 0.04396916925907135, + "rewards/rejected": -0.08785378187894821, + "sft_loss": 0.8776922225952148, + "step": 1255 + }, + { + "epoch": 1.008, + "grad_norm": 5.105708309165987, + "learning_rate": 4.198603260653792e-06, + "logits/chosen": -0.5839263796806335, + "logits/rejected": -0.866306483745575, + "logps/chosen": -0.8336095809936523, + "logps/rejected": -1.26575767993927, + "loss": 0.8358, + "odds_ratio_loss": 0.46527212858200073, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.041680481284856796, + "rewards/margins": 0.021607402712106705, + "rewards/rejected": -0.0632878839969635, + "sft_loss": 0.8336095809936523, + "step": 1260 + }, + { + "epoch": 1.012, + "grad_norm": 4.8302181441829966, + "learning_rate": 4.1900475595691044e-06, + "logits/chosen": -0.599847674369812, + "logits/rejected": -1.0592182874679565, + "logps/chosen": -0.5498633980751038, + "logps/rejected": -1.3975508213043213, + "loss": 0.7752, + "odds_ratio_loss": 0.25286543369293213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027493169531226158, + "rewards/margins": 0.042384374886751175, + "rewards/rejected": -0.06987754255533218, + "sft_loss": 0.5498633980751038, + "step": 1265 + }, + { + "epoch": 1.016, + "grad_norm": 6.056131985323491, + "learning_rate": 4.181455249275701e-06, + "logits/chosen": -0.2955573499202728, + "logits/rejected": -0.5213496088981628, + "logps/chosen": -0.7416674494743347, + "logps/rejected": -0.9686793088912964, + "loss": 0.8138, + "odds_ratio_loss": 0.6542133092880249, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.037083376199007034, + "rewards/margins": 0.011350592598319054, + "rewards/rejected": -0.04843396693468094, + "sft_loss": 0.7416674494743347, + "step": 1270 + }, + { + "epoch": 1.02, + "grad_norm": 5.064697244932875, + "learning_rate": 4.172826515897146e-06, + "logits/chosen": -0.6576083302497864, + "logits/rejected": -0.812027096748352, + "logps/chosen": -0.6591814160346985, + "logps/rejected": -1.5505030155181885, + "loss": 0.7844, + "odds_ratio_loss": 0.30150288343429565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03295907378196716, + "rewards/margins": 0.0445660725235939, + "rewards/rejected": -0.07752513885498047, + "sft_loss": 0.6591814160346985, + "step": 1275 + }, + { + "epoch": 1.024, + "grad_norm": 5.807191452802592, + "learning_rate": 4.1641615463459926e-06, + "logits/chosen": -0.4745884835720062, + "logits/rejected": -0.4976702332496643, + "logps/chosen": -0.6141924858093262, + "logps/rejected": -1.0756144523620605, + "loss": 0.7885, + "odds_ratio_loss": 0.458048015832901, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03070961870253086, + "rewards/margins": 0.023071100935339928, + "rewards/rejected": -0.05378072336316109, + "sft_loss": 0.6141924858093262, + "step": 1280 + }, + { + "epoch": 1.028, + "grad_norm": 5.371739030260733, + "learning_rate": 4.1554605283197255e-06, + "logits/chosen": -0.8033881187438965, + "logits/rejected": -0.8408336639404297, + "logps/chosen": -0.478267103433609, + "logps/rejected": -1.3151143789291382, + "loss": 0.8676, + "odds_ratio_loss": 0.30922359228134155, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02391335740685463, + "rewards/margins": 0.04184236004948616, + "rewards/rejected": -0.06575571000576019, + "sft_loss": 0.478267103433609, + "step": 1285 + }, + { + "epoch": 1.032, + "grad_norm": 6.568180405098848, + "learning_rate": 4.146723650296701e-06, + "logits/chosen": -0.6499618887901306, + "logits/rejected": -0.931594967842102, + "logps/chosen": -1.0891497135162354, + "logps/rejected": -1.2812997102737427, + "loss": 0.9537, + "odds_ratio_loss": 0.7874979972839355, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05445748567581177, + "rewards/margins": 0.009607489220798016, + "rewards/rejected": -0.06406497955322266, + "sft_loss": 1.0891497135162354, + "step": 1290 + }, + { + "epoch": 1.036, + "grad_norm": 8.474892928899138, + "learning_rate": 4.1379511015320625e-06, + "logits/chosen": -0.4418262839317322, + "logits/rejected": -0.8559747934341431, + "logps/chosen": -0.9219955205917358, + "logps/rejected": -1.7327539920806885, + "loss": 0.8688, + "odds_ratio_loss": 0.3014797568321228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04609977453947067, + "rewards/margins": 0.04053793102502823, + "rewards/rejected": -0.0866376981139183, + "sft_loss": 0.9219955205917358, + "step": 1295 + }, + { + "epoch": 1.04, + "grad_norm": 5.782082477060897, + "learning_rate": 4.129143072053639e-06, + "logits/chosen": -0.7809382677078247, + "logits/rejected": -0.8675028085708618, + "logps/chosen": -0.8315264582633972, + "logps/rejected": -1.3996779918670654, + "loss": 0.899, + "odds_ratio_loss": 0.3759022355079651, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0415763296186924, + "rewards/margins": 0.02840757742524147, + "rewards/rejected": -0.06998389959335327, + "sft_loss": 0.8315264582633972, + "step": 1300 + }, + { + "epoch": 1.044, + "grad_norm": 6.243618609939396, + "learning_rate": 4.120299752657828e-06, + "logits/chosen": -0.7213522791862488, + "logits/rejected": -0.5738562345504761, + "logps/chosen": -0.927670955657959, + "logps/rejected": -1.4523080587387085, + "loss": 0.8292, + "odds_ratio_loss": 0.45550140738487244, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04638354852795601, + "rewards/margins": 0.026231860741972923, + "rewards/rejected": -0.07261540740728378, + "sft_loss": 0.927670955657959, + "step": 1305 + }, + { + "epoch": 1.048, + "grad_norm": 9.073750027816232, + "learning_rate": 4.111421334905468e-06, + "logits/chosen": -0.6499530673027039, + "logits/rejected": -0.8254686594009399, + "logps/chosen": -0.9249345660209656, + "logps/rejected": -1.39388906955719, + "loss": 0.7753, + "odds_ratio_loss": 0.43550366163253784, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0462467297911644, + "rewards/margins": 0.023447733372449875, + "rewards/rejected": -0.06969445943832397, + "sft_loss": 0.9249345660209656, + "step": 1310 + }, + { + "epoch": 1.052, + "grad_norm": 10.11337598302246, + "learning_rate": 4.102508011117684e-06, + "logits/chosen": -0.44864311814308167, + "logits/rejected": -0.7308858633041382, + "logps/chosen": -0.9878190755844116, + "logps/rejected": -1.7209956645965576, + "loss": 0.7796, + "odds_ratio_loss": 0.357673704624176, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04939096421003342, + "rewards/margins": 0.03665883094072342, + "rewards/rejected": -0.08604978024959564, + "sft_loss": 0.9878190755844116, + "step": 1315 + }, + { + "epoch": 1.056, + "grad_norm": 4.673425946179824, + "learning_rate": 4.093559974371725e-06, + "logits/chosen": -0.6861502528190613, + "logits/rejected": -0.45734572410583496, + "logps/chosen": -0.5693352818489075, + "logps/rejected": -1.291886568069458, + "loss": 0.8203, + "odds_ratio_loss": 0.2426222562789917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028466764837503433, + "rewards/margins": 0.036127571016550064, + "rewards/rejected": -0.0645943284034729, + "sft_loss": 0.5693352818489075, + "step": 1320 + }, + { + "epoch": 1.06, + "grad_norm": 5.847314097304619, + "learning_rate": 4.084577418496775e-06, + "logits/chosen": -0.8275073766708374, + "logits/rejected": -0.6742109060287476, + "logps/chosen": -0.5419312119483948, + "logps/rejected": -1.4742541313171387, + "loss": 0.8365, + "odds_ratio_loss": 0.3019058406352997, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02709656022489071, + "rewards/margins": 0.046616148203611374, + "rewards/rejected": -0.07371271401643753, + "sft_loss": 0.5419312119483948, + "step": 1325 + }, + { + "epoch": 1.064, + "grad_norm": 4.856747702455789, + "learning_rate": 4.075560538069767e-06, + "logits/chosen": -0.5532726049423218, + "logits/rejected": -0.6589607000350952, + "logps/chosen": -0.7456861734390259, + "logps/rejected": -1.2630527019500732, + "loss": 0.8495, + "odds_ratio_loss": 0.54130619764328, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03728431463241577, + "rewards/margins": 0.025868326425552368, + "rewards/rejected": -0.06315263360738754, + "sft_loss": 0.7456861734390259, + "step": 1330 + }, + { + "epoch": 1.068, + "grad_norm": 7.765532059632631, + "learning_rate": 4.066509528411151e-06, + "logits/chosen": -0.5705880522727966, + "logits/rejected": -0.7791796922683716, + "logps/chosen": -0.867768406867981, + "logps/rejected": -1.508888602256775, + "loss": 0.8959, + "odds_ratio_loss": 0.4603050649166107, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04338841885328293, + "rewards/margins": 0.032056018710136414, + "rewards/rejected": -0.07544443756341934, + "sft_loss": 0.867768406867981, + "step": 1335 + }, + { + "epoch": 1.072, + "grad_norm": 12.916315689810858, + "learning_rate": 4.05742458558068e-06, + "logits/chosen": -0.8875905275344849, + "logits/rejected": -0.7213624119758606, + "logps/chosen": -0.7063192129135132, + "logps/rejected": -1.434460997581482, + "loss": 0.7654, + "odds_ratio_loss": 0.3284406065940857, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03531596064567566, + "rewards/margins": 0.03640709072351456, + "rewards/rejected": -0.07172305136919022, + "sft_loss": 0.7063192129135132, + "step": 1340 + }, + { + "epoch": 1.076, + "grad_norm": 7.30216329780713, + "learning_rate": 4.048305906373151e-06, + "logits/chosen": -0.6724140644073486, + "logits/rejected": -0.686550498008728, + "logps/chosen": -0.9202359914779663, + "logps/rejected": -1.229242205619812, + "loss": 0.8455, + "odds_ratio_loss": 0.5194590091705322, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.046011798083782196, + "rewards/margins": 0.015450313687324524, + "rewards/rejected": -0.06146211549639702, + "sft_loss": 0.9202359914779663, + "step": 1345 + }, + { + "epoch": 1.08, + "grad_norm": 7.376368865831852, + "learning_rate": 4.039153688314146e-06, + "logits/chosen": -0.6789649724960327, + "logits/rejected": -0.7698289155960083, + "logps/chosen": -1.0499086380004883, + "logps/rejected": -1.4064363241195679, + "loss": 0.8224, + "odds_ratio_loss": 0.5207637548446655, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05249543860554695, + "rewards/margins": 0.017826389521360397, + "rewards/rejected": -0.07032182067632675, + "sft_loss": 1.0499086380004883, + "step": 1350 + }, + { + "epoch": 1.084, + "grad_norm": 10.69540570354578, + "learning_rate": 4.029968129655757e-06, + "logits/chosen": -0.7770857214927673, + "logits/rejected": -0.9475702047348022, + "logps/chosen": -0.7179058790206909, + "logps/rejected": -1.5098683834075928, + "loss": 0.8206, + "odds_ratio_loss": 0.33577170968055725, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.035895295441150665, + "rewards/margins": 0.03959812596440315, + "rewards/rejected": -0.07549341022968292, + "sft_loss": 0.7179058790206909, + "step": 1355 + }, + { + "epoch": 1.088, + "grad_norm": 8.204731502525116, + "learning_rate": 4.020749429372286e-06, + "logits/chosen": -0.851569652557373, + "logits/rejected": -0.98048335313797, + "logps/chosen": -0.7297436594963074, + "logps/rejected": -1.4111974239349365, + "loss": 0.8462, + "odds_ratio_loss": 0.3803647458553314, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.036487188190221786, + "rewards/margins": 0.03407268971204758, + "rewards/rejected": -0.07055987417697906, + "sft_loss": 0.7297436594963074, + "step": 1360 + }, + { + "epoch": 1.092, + "grad_norm": 8.235008694291277, + "learning_rate": 4.011497787155938e-06, + "logits/chosen": -0.7461971044540405, + "logits/rejected": -1.4180670976638794, + "logps/chosen": -0.9519845843315125, + "logps/rejected": -1.9593369960784912, + "loss": 0.9038, + "odds_ratio_loss": 0.356905996799469, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04759923368692398, + "rewards/margins": 0.050367631018161774, + "rewards/rejected": -0.09796686470508575, + "sft_loss": 0.9519845843315125, + "step": 1365 + }, + { + "epoch": 1.096, + "grad_norm": 15.389585033573624, + "learning_rate": 4.002213403412492e-06, + "logits/chosen": -0.6717543005943298, + "logits/rejected": -0.9089921116828918, + "logps/chosen": -1.0161564350128174, + "logps/rejected": -1.3220138549804688, + "loss": 0.8947, + "odds_ratio_loss": 0.5496761202812195, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05080781504511833, + "rewards/margins": 0.01529287826269865, + "rewards/rejected": -0.06610070168972015, + "sft_loss": 1.0161564350128174, + "step": 1370 + }, + { + "epoch": 1.1, + "grad_norm": 7.445613929034652, + "learning_rate": 3.992896479256966e-06, + "logits/chosen": -0.680860161781311, + "logits/rejected": -0.6606763601303101, + "logps/chosen": -0.6550648808479309, + "logps/rejected": -1.352782964706421, + "loss": 0.8223, + "odds_ratio_loss": 0.3205656111240387, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.032753244042396545, + "rewards/margins": 0.03488590195775032, + "rewards/rejected": -0.06763914227485657, + "sft_loss": 0.6550648808479309, + "step": 1375 + }, + { + "epoch": 1.104, + "grad_norm": 5.348676916256749, + "learning_rate": 3.983547216509254e-06, + "logits/chosen": -0.5789454579353333, + "logits/rejected": -1.2544965744018555, + "logps/chosen": -0.796095609664917, + "logps/rejected": -1.5642998218536377, + "loss": 0.7846, + "odds_ratio_loss": 0.32175213098526, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03980477899312973, + "rewards/margins": 0.038410209119319916, + "rewards/rejected": -0.07821498811244965, + "sft_loss": 0.796095609664917, + "step": 1380 + }, + { + "epoch": 1.108, + "grad_norm": 11.505326821901532, + "learning_rate": 3.974165817689758e-06, + "logits/chosen": -0.956534206867218, + "logits/rejected": -1.1092783212661743, + "logps/chosen": -1.0736840963363647, + "logps/rejected": -1.5605031251907349, + "loss": 0.8767, + "odds_ratio_loss": 0.49855270981788635, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05368421599268913, + "rewards/margins": 0.02434094250202179, + "rewards/rejected": -0.07802516222000122, + "sft_loss": 1.0736840963363647, + "step": 1385 + }, + { + "epoch": 1.112, + "grad_norm": 7.11996683938393, + "learning_rate": 3.964752486015001e-06, + "logits/chosen": -0.702655553817749, + "logits/rejected": -1.0277684926986694, + "logps/chosen": -0.8141889572143555, + "logps/rejected": -1.9155117273330688, + "loss": 0.7498, + "odds_ratio_loss": 0.26831483840942383, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04070945456624031, + "rewards/margins": 0.05506613105535507, + "rewards/rejected": -0.09577558934688568, + "sft_loss": 0.8141889572143555, + "step": 1390 + }, + { + "epoch": 1.116, + "grad_norm": 5.797590260170669, + "learning_rate": 3.955307425393224e-06, + "logits/chosen": -0.7000407576560974, + "logits/rejected": -0.9680187106132507, + "logps/chosen": -0.9160090684890747, + "logps/rejected": -1.364323616027832, + "loss": 0.8904, + "odds_ratio_loss": 0.45110782980918884, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.045800451189279556, + "rewards/margins": 0.022415729239583015, + "rewards/rejected": -0.06821618974208832, + "sft_loss": 0.9160090684890747, + "step": 1395 + }, + { + "epoch": 1.12, + "grad_norm": 8.433254983665345, + "learning_rate": 3.945830840419966e-06, + "logits/chosen": -0.7009550929069519, + "logits/rejected": -1.0051246881484985, + "logps/chosen": -0.4989989697933197, + "logps/rejected": -1.407712697982788, + "loss": 0.7925, + "odds_ratio_loss": 0.27720946073532104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024949947372078896, + "rewards/margins": 0.045435696840286255, + "rewards/rejected": -0.07038564234972, + "sft_loss": 0.4989989697933197, + "step": 1400 + }, + { + "epoch": 1.124, + "grad_norm": 12.99123162983583, + "learning_rate": 3.936322936373641e-06, + "logits/chosen": -0.6947557330131531, + "logits/rejected": -0.905806839466095, + "logps/chosen": -0.9586040377616882, + "logps/rejected": -1.2324825525283813, + "loss": 0.8352, + "odds_ratio_loss": 0.5849367380142212, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04793020337820053, + "rewards/margins": 0.013693928718566895, + "rewards/rejected": -0.06162412837147713, + "sft_loss": 0.9586040377616882, + "step": 1405 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 8.170518488538953, + "learning_rate": 3.92678391921108e-06, + "logits/chosen": -1.1551212072372437, + "logits/rejected": -0.8484439849853516, + "logps/chosen": -0.5776058435440063, + "logps/rejected": -1.4134206771850586, + "loss": 0.8291, + "odds_ratio_loss": 0.31984660029411316, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.028880292549729347, + "rewards/margins": 0.04179074615240097, + "rewards/rejected": -0.07067102938890457, + "sft_loss": 0.5776058435440063, + "step": 1410 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 8.09496861727856, + "learning_rate": 3.9172139955630774e-06, + "logits/chosen": -0.713034987449646, + "logits/rejected": -0.9983466863632202, + "logps/chosen": -0.6458183526992798, + "logps/rejected": -1.828302025794983, + "loss": 0.866, + "odds_ratio_loss": 0.30388593673706055, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03229091316461563, + "rewards/margins": 0.0591241791844368, + "rewards/rejected": -0.09141509234905243, + "sft_loss": 0.6458183526992798, + "step": 1415 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 16.205423769664655, + "learning_rate": 3.907613372729916e-06, + "logits/chosen": -0.6726616024971008, + "logits/rejected": -1.3900980949401855, + "logps/chosen": -0.7454610466957092, + "logps/rejected": -2.1414051055908203, + "loss": 0.8471, + "odds_ratio_loss": 0.2859545350074768, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03727305680513382, + "rewards/margins": 0.06979719549417496, + "rewards/rejected": -0.10707025229930878, + "sft_loss": 0.7454610466957092, + "step": 1420 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 10.269281839827107, + "learning_rate": 3.897982258676867e-06, + "logits/chosen": -0.7214896082878113, + "logits/rejected": -0.8336132168769836, + "logps/chosen": -0.9054323434829712, + "logps/rejected": -2.1648507118225098, + "loss": 0.9165, + "odds_ratio_loss": 0.38728705048561096, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0452716164290905, + "rewards/margins": 0.06297094374895096, + "rewards/rejected": -0.10824254900217056, + "sft_loss": 0.9054323434829712, + "step": 1425 + }, + { + "epoch": 1.144, + "grad_norm": 6.605575368297014, + "learning_rate": 3.888320862029699e-06, + "logits/chosen": -0.5836726427078247, + "logits/rejected": -0.7385787963867188, + "logps/chosen": -0.9469176530838013, + "logps/rejected": -1.4403495788574219, + "loss": 0.8601, + "odds_ratio_loss": 0.4318571984767914, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04734588414430618, + "rewards/margins": 0.02467160113155842, + "rewards/rejected": -0.07201748341321945, + "sft_loss": 0.9469176530838013, + "step": 1430 + }, + { + "epoch": 1.148, + "grad_norm": 5.846266947718309, + "learning_rate": 3.878629392070143e-06, + "logits/chosen": -0.7644879817962646, + "logits/rejected": -0.9813248515129089, + "logps/chosen": -0.8727389574050903, + "logps/rejected": -1.5859434604644775, + "loss": 0.8655, + "odds_ratio_loss": 0.47635746002197266, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.043636951595544815, + "rewards/margins": 0.03566022962331772, + "rewards/rejected": -0.07929717004299164, + "sft_loss": 0.8727389574050903, + "step": 1435 + }, + { + "epoch": 1.152, + "grad_norm": 8.026595177727547, + "learning_rate": 3.868908058731376e-06, + "logits/chosen": -1.0093854665756226, + "logits/rejected": -0.8972814679145813, + "logps/chosen": -0.641372799873352, + "logps/rejected": -1.255726933479309, + "loss": 0.7964, + "odds_ratio_loss": 0.33428263664245605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0320686399936676, + "rewards/margins": 0.030717704445123672, + "rewards/rejected": -0.06278634071350098, + "sft_loss": 0.641372799873352, + "step": 1440 + }, + { + "epoch": 1.156, + "grad_norm": 7.131166323627883, + "learning_rate": 3.859157072593459e-06, + "logits/chosen": -0.8095647096633911, + "logits/rejected": -1.309104323387146, + "logps/chosen": -0.9154605865478516, + "logps/rejected": -1.6045910120010376, + "loss": 0.8756, + "odds_ratio_loss": 0.3739989399909973, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04577303305268288, + "rewards/margins": 0.0344565287232399, + "rewards/rejected": -0.08022955805063248, + "sft_loss": 0.9154605865478516, + "step": 1445 + }, + { + "epoch": 1.16, + "grad_norm": 11.191454450304596, + "learning_rate": 3.849376644878783e-06, + "logits/chosen": -0.7892749905586243, + "logits/rejected": -0.7882648706436157, + "logps/chosen": -1.122315764427185, + "logps/rejected": -1.2966439723968506, + "loss": 0.8137, + "odds_ratio_loss": 0.6968892812728882, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05611578747630119, + "rewards/margins": 0.008716410025954247, + "rewards/rejected": -0.06483219563961029, + "sft_loss": 1.122315764427185, + "step": 1450 + }, + { + "epoch": 1.164, + "grad_norm": 9.51974427968147, + "learning_rate": 3.839566987447492e-06, + "logits/chosen": -1.2010838985443115, + "logits/rejected": -0.9590059518814087, + "logps/chosen": -0.6959027051925659, + "logps/rejected": -1.873453140258789, + "loss": 0.831, + "odds_ratio_loss": 0.36710458993911743, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0347951278090477, + "rewards/margins": 0.05887751653790474, + "rewards/rejected": -0.09367264807224274, + "sft_loss": 0.6959027051925659, + "step": 1455 + }, + { + "epoch": 1.168, + "grad_norm": 6.6227065501638815, + "learning_rate": 3.829728312792895e-06, + "logits/chosen": -0.8780848383903503, + "logits/rejected": -0.8581530451774597, + "logps/chosen": -0.9142974019050598, + "logps/rejected": -1.3824703693389893, + "loss": 0.78, + "odds_ratio_loss": 0.4631693363189697, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04571487009525299, + "rewards/margins": 0.023408645763993263, + "rewards/rejected": -0.0691235214471817, + "sft_loss": 0.9142974019050598, + "step": 1460 + }, + { + "epoch": 1.172, + "grad_norm": 6.363999703267043, + "learning_rate": 3.819860834036859e-06, + "logits/chosen": -0.5977569222450256, + "logits/rejected": -0.7959061861038208, + "logps/chosen": -1.1498180627822876, + "logps/rejected": -1.42813241481781, + "loss": 0.8887, + "odds_ratio_loss": 0.5259816646575928, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05749090388417244, + "rewards/margins": 0.013915717601776123, + "rewards/rejected": -0.07140661776065826, + "sft_loss": 1.1498180627822876, + "step": 1465 + }, + { + "epoch": 1.176, + "grad_norm": 6.246391614781854, + "learning_rate": 3.8099647649251984e-06, + "logits/chosen": -0.7605501413345337, + "logits/rejected": -1.1908546686172485, + "logps/chosen": -0.8100579380989075, + "logps/rejected": -1.775541067123413, + "loss": 0.8348, + "odds_ratio_loss": 0.42932015657424927, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04050289839506149, + "rewards/margins": 0.04827415570616722, + "rewards/rejected": -0.08877705037593842, + "sft_loss": 0.8100579380989075, + "step": 1470 + }, + { + "epoch": 1.18, + "grad_norm": 5.893619858051333, + "learning_rate": 3.8000403198230385e-06, + "logits/chosen": -0.787771463394165, + "logits/rejected": -1.0664321184158325, + "logps/chosen": -0.8280608057975769, + "logps/rejected": -1.8852030038833618, + "loss": 0.822, + "odds_ratio_loss": 0.3562834858894348, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.041403044015169144, + "rewards/margins": 0.05285710096359253, + "rewards/rejected": -0.09426014125347137, + "sft_loss": 0.8280608057975769, + "step": 1475 + }, + { + "epoch": 1.184, + "grad_norm": 7.183406495752281, + "learning_rate": 3.790087713710179e-06, + "logits/chosen": -0.7213200330734253, + "logits/rejected": -0.6918378472328186, + "logps/chosen": -0.5293585658073425, + "logps/rejected": -1.8362514972686768, + "loss": 0.7973, + "odds_ratio_loss": 0.19014953076839447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026467930525541306, + "rewards/margins": 0.06534464657306671, + "rewards/rejected": -0.09181257337331772, + "sft_loss": 0.5293585658073425, + "step": 1480 + }, + { + "epoch": 1.188, + "grad_norm": 17.119549825566487, + "learning_rate": 3.780107162176429e-06, + "logits/chosen": -0.6820231676101685, + "logits/rejected": -1.1885521411895752, + "logps/chosen": -0.7856122255325317, + "logps/rejected": -1.2408835887908936, + "loss": 0.8931, + "odds_ratio_loss": 0.4235461354255676, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03928060457110405, + "rewards/margins": 0.022763576358556747, + "rewards/rejected": -0.0620441809296608, + "sft_loss": 0.7856122255325317, + "step": 1485 + }, + { + "epoch": 1.192, + "grad_norm": 6.561999759540379, + "learning_rate": 3.770098881416945e-06, + "logits/chosen": -0.8763113021850586, + "logits/rejected": -1.155576229095459, + "logps/chosen": -0.5533097386360168, + "logps/rejected": -1.3931071758270264, + "loss": 0.8147, + "odds_ratio_loss": 0.2342720478773117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02766549028456211, + "rewards/margins": 0.041989874094724655, + "rewards/rejected": -0.06965536624193192, + "sft_loss": 0.5533097386360168, + "step": 1490 + }, + { + "epoch": 1.196, + "grad_norm": 5.852667265636353, + "learning_rate": 3.760063088227542e-06, + "logits/chosen": -0.7204443216323853, + "logits/rejected": -1.0452661514282227, + "logps/chosen": -0.6752656698226929, + "logps/rejected": -1.6966698169708252, + "loss": 0.842, + "odds_ratio_loss": 0.3053438067436218, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.033763282001018524, + "rewards/margins": 0.0510701946914196, + "rewards/rejected": -0.08483348786830902, + "sft_loss": 0.6752656698226929, + "step": 1495 + }, + { + "epoch": 1.2, + "grad_norm": 7.666952370293206, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -1.2530326843261719, + "logits/rejected": -1.17472243309021, + "logps/chosen": -0.8737776875495911, + "logps/rejected": -1.5337140560150146, + "loss": 0.8295, + "odds_ratio_loss": 0.4370272159576416, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.043688882142305374, + "rewards/margins": 0.032996825873851776, + "rewards/rejected": -0.07668570429086685, + "sft_loss": 0.8737776875495911, + "step": 1500 + }, + { + "epoch": 1.204, + "grad_norm": 8.643591741745004, + "learning_rate": 3.739909834717356e-06, + "logits/chosen": -1.0214110612869263, + "logits/rejected": -1.1113232374191284, + "logps/chosen": -1.0257785320281982, + "logps/rejected": -1.4637627601623535, + "loss": 0.84, + "odds_ratio_loss": 0.4033835530281067, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.05128893256187439, + "rewards/margins": 0.021899200975894928, + "rewards/rejected": -0.07318813353776932, + "sft_loss": 1.0257785320281982, + "step": 1505 + }, + { + "epoch": 1.208, + "grad_norm": 6.581431751480548, + "learning_rate": 3.7297928109491765e-06, + "logits/chosen": -0.735799252986908, + "logits/rejected": -0.750560462474823, + "logps/chosen": -0.838580310344696, + "logps/rejected": -1.3117971420288086, + "loss": 0.8373, + "odds_ratio_loss": 0.4745975434780121, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04192901775240898, + "rewards/margins": 0.023660842329263687, + "rewards/rejected": -0.06558986008167267, + "sft_loss": 0.838580310344696, + "step": 1510 + }, + { + "epoch": 1.212, + "grad_norm": 8.38248048628384, + "learning_rate": 3.7196491478468322e-06, + "logits/chosen": -0.7478394508361816, + "logits/rejected": -0.9995372891426086, + "logps/chosen": -0.8661327362060547, + "logps/rejected": -1.3012148141860962, + "loss": 0.7344, + "odds_ratio_loss": 0.4485481381416321, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.043306637555360794, + "rewards/margins": 0.021754100918769836, + "rewards/rejected": -0.06506074219942093, + "sft_loss": 0.8661327362060547, + "step": 1515 + }, + { + "epoch": 1.216, + "grad_norm": 7.766215821807629, + "learning_rate": 3.7094790651387414e-06, + "logits/chosen": -0.7365530729293823, + "logits/rejected": -0.8527762293815613, + "logps/chosen": -0.9115310907363892, + "logps/rejected": -1.5248000621795654, + "loss": 0.8285, + "odds_ratio_loss": 0.4467516839504242, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0455765500664711, + "rewards/margins": 0.030663449317216873, + "rewards/rejected": -0.07623999565839767, + "sft_loss": 0.9115310907363892, + "step": 1520 + }, + { + "epoch": 1.22, + "grad_norm": 6.4955437270373135, + "learning_rate": 3.699282783125616e-06, + "logits/chosen": -0.7742820978164673, + "logits/rejected": -0.937674880027771, + "logps/chosen": -0.8425240516662598, + "logps/rejected": -1.2824028730392456, + "loss": 0.779, + "odds_ratio_loss": 0.4461577534675598, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04212620481848717, + "rewards/margins": 0.021993935108184814, + "rewards/rejected": -0.06412014365196228, + "sft_loss": 0.8425240516662598, + "step": 1525 + }, + { + "epoch": 1.224, + "grad_norm": 6.362003415761398, + "learning_rate": 3.689060522675689e-06, + "logits/chosen": -0.7660902142524719, + "logits/rejected": -1.5068974494934082, + "logps/chosen": -0.6618943810462952, + "logps/rejected": -1.425063133239746, + "loss": 0.845, + "odds_ratio_loss": 0.4632844030857086, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03309471905231476, + "rewards/margins": 0.038158439099788666, + "rewards/rejected": -0.07125315815210342, + "sft_loss": 0.6618943810462952, + "step": 1530 + }, + { + "epoch": 1.228, + "grad_norm": 6.895766693871366, + "learning_rate": 3.6788125052199264e-06, + "logits/chosen": -0.7660204172134399, + "logits/rejected": -1.0502755641937256, + "logps/chosen": -0.8439900279045105, + "logps/rejected": -1.3339447975158691, + "loss": 0.8453, + "odds_ratio_loss": 0.40678563714027405, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.042199499905109406, + "rewards/margins": 0.024497732520103455, + "rewards/rejected": -0.06669723987579346, + "sft_loss": 0.8439900279045105, + "step": 1535 + }, + { + "epoch": 1.232, + "grad_norm": 7.095832854877261, + "learning_rate": 3.668538952747236e-06, + "logits/chosen": -0.8988308906555176, + "logits/rejected": -1.370678186416626, + "logps/chosen": -0.7767564058303833, + "logps/rejected": -0.9345728158950806, + "loss": 0.8361, + "odds_ratio_loss": 0.6314736008644104, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03883781656622887, + "rewards/margins": 0.007890825159847736, + "rewards/rejected": -0.04672864452004433, + "sft_loss": 0.7767564058303833, + "step": 1540 + }, + { + "epoch": 1.236, + "grad_norm": 15.297702452409231, + "learning_rate": 3.658240087799655e-06, + "logits/chosen": -1.128368854522705, + "logits/rejected": -0.9741285443305969, + "logps/chosen": -0.9071061015129089, + "logps/rejected": -1.4348928928375244, + "loss": 0.8299, + "odds_ratio_loss": 0.4918842315673828, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04535530507564545, + "rewards/margins": 0.026389339938759804, + "rewards/rejected": -0.0717446431517601, + "sft_loss": 0.9071061015129089, + "step": 1545 + }, + { + "epoch": 1.24, + "grad_norm": 15.626108640705025, + "learning_rate": 3.6479161334675294e-06, + "logits/chosen": -0.7529286742210388, + "logits/rejected": -0.9413396716117859, + "logps/chosen": -0.6087412238121033, + "logps/rejected": -1.2803432941436768, + "loss": 0.7919, + "odds_ratio_loss": 0.30838295817375183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030437063425779343, + "rewards/margins": 0.033580102026462555, + "rewards/rejected": -0.0640171617269516, + "sft_loss": 0.6087412238121033, + "step": 1550 + }, + { + "epoch": 1.244, + "grad_norm": 7.081745221757999, + "learning_rate": 3.6375673133846847e-06, + "logits/chosen": -0.8344209790229797, + "logits/rejected": -0.9608109593391418, + "logps/chosen": -1.0645172595977783, + "logps/rejected": -1.4052292108535767, + "loss": 0.8832, + "odds_ratio_loss": 0.5416877269744873, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05322585627436638, + "rewards/margins": 0.017035599797964096, + "rewards/rejected": -0.07026146352291107, + "sft_loss": 1.0645172595977783, + "step": 1555 + }, + { + "epoch": 1.248, + "grad_norm": 7.328229540786072, + "learning_rate": 3.627193851723577e-06, + "logits/chosen": -0.8219780921936035, + "logits/rejected": -1.0412352085113525, + "logps/chosen": -0.8243353962898254, + "logps/rejected": -1.1440980434417725, + "loss": 0.7661, + "odds_ratio_loss": 0.48546719551086426, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04121676832437515, + "rewards/margins": 0.01598813384771347, + "rewards/rejected": -0.05720490217208862, + "sft_loss": 0.8243353962898254, + "step": 1560 + }, + { + "epoch": 1.252, + "grad_norm": 8.481385406459447, + "learning_rate": 3.616795973190442e-06, + "logits/chosen": -0.6780751943588257, + "logits/rejected": -1.5924546718597412, + "logps/chosen": -0.7332156896591187, + "logps/rejected": -1.6106353998184204, + "loss": 0.8628, + "odds_ratio_loss": 0.2695372700691223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03666078671813011, + "rewards/margins": 0.04387098178267479, + "rewards/rejected": -0.0805317685008049, + "sft_loss": 0.7332156896591187, + "step": 1565 + }, + { + "epoch": 1.256, + "grad_norm": 4.8716559657268625, + "learning_rate": 3.6063739030204226e-06, + "logits/chosen": -0.7873969078063965, + "logits/rejected": -1.142225980758667, + "logps/chosen": -0.7441651821136475, + "logps/rejected": -1.4604860544204712, + "loss": 0.7917, + "odds_ratio_loss": 0.44381627440452576, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03720825910568237, + "rewards/margins": 0.03581603989005089, + "rewards/rejected": -0.07302430272102356, + "sft_loss": 0.7441651821136475, + "step": 1570 + }, + { + "epoch": 1.26, + "grad_norm": 5.809355093226559, + "learning_rate": 3.595927866972694e-06, + "logits/chosen": -0.8129979968070984, + "logits/rejected": -0.8149601817131042, + "logps/chosen": -0.7339099645614624, + "logps/rejected": -1.4441230297088623, + "loss": 0.8216, + "odds_ratio_loss": 0.3208310604095459, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03669549524784088, + "rewards/margins": 0.035510655492544174, + "rewards/rejected": -0.07220615446567535, + "sft_loss": 0.7339099645614624, + "step": 1575 + }, + { + "epoch": 1.264, + "grad_norm": 6.200919368514555, + "learning_rate": 3.5854580913255706e-06, + "logits/chosen": -0.5802955627441406, + "logits/rejected": -0.772422194480896, + "logps/chosen": -1.0532017946243286, + "logps/rejected": -1.506858468055725, + "loss": 0.8307, + "odds_ratio_loss": 0.4608602523803711, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05266008898615837, + "rewards/margins": 0.022682830691337585, + "rewards/rejected": -0.07534292340278625, + "sft_loss": 1.0532017946243286, + "step": 1580 + }, + { + "epoch": 1.268, + "grad_norm": 5.8218405882663085, + "learning_rate": 3.574964802871607e-06, + "logits/chosen": -0.5340670347213745, + "logits/rejected": -1.2797355651855469, + "logps/chosen": -0.8620258569717407, + "logps/rejected": -1.4560054540634155, + "loss": 0.8596, + "odds_ratio_loss": 0.3901941776275635, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.043101292103528976, + "rewards/margins": 0.029698973521590233, + "rewards/rejected": -0.07280026376247406, + "sft_loss": 0.8620258569717407, + "step": 1585 + }, + { + "epoch": 1.272, + "grad_norm": 6.664692745829687, + "learning_rate": 3.564448228912682e-06, + "logits/chosen": -0.5848423838615417, + "logits/rejected": -0.9934035539627075, + "logps/chosen": -0.7198655009269714, + "logps/rejected": -1.534895896911621, + "loss": 0.6926, + "odds_ratio_loss": 0.38178950548171997, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03599327430129051, + "rewards/margins": 0.04075152799487114, + "rewards/rejected": -0.07674480974674225, + "sft_loss": 0.7198655009269714, + "step": 1590 + }, + { + "epoch": 1.276, + "grad_norm": 6.854701270720263, + "learning_rate": 3.5539085972550786e-06, + "logits/chosen": -0.5101417303085327, + "logits/rejected": -1.0358482599258423, + "logps/chosen": -0.7137448191642761, + "logps/rejected": -1.2877639532089233, + "loss": 0.7468, + "odds_ratio_loss": 0.388786256313324, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.035687245428562164, + "rewards/margins": 0.02870095707476139, + "rewards/rejected": -0.0643882006406784, + "sft_loss": 0.7137448191642761, + "step": 1595 + }, + { + "epoch": 1.28, + "grad_norm": 6.779174368552459, + "learning_rate": 3.543346136204545e-06, + "logits/chosen": -0.9765304327011108, + "logits/rejected": -0.6466418504714966, + "logps/chosen": -0.9283822178840637, + "logps/rejected": -1.2504222393035889, + "loss": 0.8717, + "odds_ratio_loss": 0.5800614953041077, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04641910642385483, + "rewards/margins": 0.016102004796266556, + "rewards/rejected": -0.06252111494541168, + "sft_loss": 0.9283822178840637, + "step": 1600 + }, + { + "epoch": 1.284, + "grad_norm": 9.786348784846165, + "learning_rate": 3.532761074561355e-06, + "logits/chosen": -0.6300225853919983, + "logits/rejected": -1.1493359804153442, + "logps/chosen": -1.0870106220245361, + "logps/rejected": -1.61102294921875, + "loss": 0.801, + "odds_ratio_loss": 0.4377533793449402, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.054350532591342926, + "rewards/margins": 0.026200611144304276, + "rewards/rejected": -0.0805511474609375, + "sft_loss": 1.0870106220245361, + "step": 1605 + }, + { + "epoch": 1.288, + "grad_norm": 5.913885782177144, + "learning_rate": 3.522153641615345e-06, + "logits/chosen": -0.5452396273612976, + "logits/rejected": -1.095402479171753, + "logps/chosen": -0.9149934649467468, + "logps/rejected": -1.1826355457305908, + "loss": 0.9447, + "odds_ratio_loss": 0.5239506959915161, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04574967175722122, + "rewards/margins": 0.01338210143148899, + "rewards/rejected": -0.05913177877664566, + "sft_loss": 0.9149934649467468, + "step": 1610 + }, + { + "epoch": 1.292, + "grad_norm": 5.896924307562007, + "learning_rate": 3.5115240671409534e-06, + "logits/chosen": -0.6529924273490906, + "logits/rejected": -0.7851907014846802, + "logps/chosen": -0.8748595118522644, + "logps/rejected": -1.2012172937393188, + "loss": 0.8603, + "odds_ratio_loss": 0.5565796494483948, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04374297708272934, + "rewards/margins": 0.016317885369062424, + "rewards/rejected": -0.060060858726501465, + "sft_loss": 0.8748595118522644, + "step": 1615 + }, + { + "epoch": 1.296, + "grad_norm": 9.095017819397759, + "learning_rate": 3.5008725813922383e-06, + "logits/chosen": -0.5774226188659668, + "logits/rejected": -1.0028631687164307, + "logps/chosen": -1.284346342086792, + "logps/rejected": -1.4374910593032837, + "loss": 0.8847, + "odds_ratio_loss": 0.6303257346153259, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06421732157468796, + "rewards/margins": 0.007657224778085947, + "rewards/rejected": -0.07187455147504807, + "sft_loss": 1.284346342086792, + "step": 1620 + }, + { + "epoch": 1.3, + "grad_norm": 9.33528794237972, + "learning_rate": 3.4901994150978926e-06, + "logits/chosen": -0.878362774848938, + "logits/rejected": -0.7265375256538391, + "logps/chosen": -0.8903997540473938, + "logps/rejected": -1.3938519954681396, + "loss": 0.8503, + "odds_ratio_loss": 0.45942550897598267, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04451999068260193, + "rewards/margins": 0.025172609835863113, + "rewards/rejected": -0.06969259679317474, + "sft_loss": 0.8903997540473938, + "step": 1625 + }, + { + "epoch": 1.304, + "grad_norm": 10.158405753265898, + "learning_rate": 3.4795047994562463e-06, + "logits/chosen": -0.8939259648323059, + "logits/rejected": -0.750242292881012, + "logps/chosen": -0.9898282885551453, + "logps/rejected": -1.685683250427246, + "loss": 0.8473, + "odds_ratio_loss": 0.5104387402534485, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.049491409212350845, + "rewards/margins": 0.03479275852441788, + "rewards/rejected": -0.08428415656089783, + "sft_loss": 0.9898282885551453, + "step": 1630 + }, + { + "epoch": 1.308, + "grad_norm": 5.954588970837507, + "learning_rate": 3.4687889661302577e-06, + "logits/chosen": -0.9046002626419067, + "logits/rejected": -0.9883207082748413, + "logps/chosen": -0.8091312646865845, + "logps/rejected": -1.3389207124710083, + "loss": 0.8038, + "odds_ratio_loss": 0.5021311640739441, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.040456559509038925, + "rewards/margins": 0.02648947760462761, + "rewards/rejected": -0.06694603711366653, + "sft_loss": 0.8091312646865845, + "step": 1635 + }, + { + "epoch": 1.312, + "grad_norm": 6.811003884561586, + "learning_rate": 3.458052147242494e-06, + "logits/chosen": -0.7402358651161194, + "logits/rejected": -0.9504525065422058, + "logps/chosen": -0.9438158273696899, + "logps/rejected": -2.0763096809387207, + "loss": 0.8385, + "odds_ratio_loss": 0.28224462270736694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.047190792858600616, + "rewards/margins": 0.05662469193339348, + "rewards/rejected": -0.1038154810667038, + "sft_loss": 0.9438158273696899, + "step": 1640 + }, + { + "epoch": 1.316, + "grad_norm": 8.65594926906311, + "learning_rate": 3.4472945753701038e-06, + "logits/chosen": -0.7355834245681763, + "logits/rejected": -0.7831618189811707, + "logps/chosen": -0.8326643705368042, + "logps/rejected": -1.457876205444336, + "loss": 0.8597, + "odds_ratio_loss": 0.43533915281295776, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04163322225213051, + "rewards/margins": 0.031260598450899124, + "rewards/rejected": -0.07289381325244904, + "sft_loss": 0.8326643705368042, + "step": 1645 + }, + { + "epoch": 1.32, + "grad_norm": 5.787660518955911, + "learning_rate": 3.436516483539781e-06, + "logits/chosen": -0.6674588918685913, + "logits/rejected": -1.381403923034668, + "logps/chosen": -0.6492362022399902, + "logps/rejected": -1.1699539422988892, + "loss": 0.7273, + "odds_ratio_loss": 0.35149624943733215, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03246181458234787, + "rewards/margins": 0.026035884395241737, + "rewards/rejected": -0.05849768966436386, + "sft_loss": 0.6492362022399902, + "step": 1650 + }, + { + "epoch": 1.324, + "grad_norm": 6.447720144022753, + "learning_rate": 3.4257181052227133e-06, + "logits/chosen": -0.76915043592453, + "logits/rejected": -1.1846320629119873, + "logps/chosen": -0.8949073553085327, + "logps/rejected": -1.5921183824539185, + "loss": 0.8674, + "odds_ratio_loss": 0.47409844398498535, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04474537447094917, + "rewards/margins": 0.03486054390668869, + "rewards/rejected": -0.07960591465234756, + "sft_loss": 0.8949073553085327, + "step": 1655 + }, + { + "epoch": 1.328, + "grad_norm": 16.496650507435916, + "learning_rate": 3.4148996743295305e-06, + "logits/chosen": -0.8497360348701477, + "logits/rejected": -1.0253467559814453, + "logps/chosen": -0.7608711123466492, + "logps/rejected": -1.4417269229888916, + "loss": 0.7469, + "odds_ratio_loss": 0.33891811966896057, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.038043562322854996, + "rewards/margins": 0.034042783081531525, + "rewards/rejected": -0.07208634912967682, + "sft_loss": 0.7608711123466492, + "step": 1660 + }, + { + "epoch": 1.332, + "grad_norm": 12.21871272581458, + "learning_rate": 3.4040614252052305e-06, + "logits/chosen": -0.7546567916870117, + "logits/rejected": -1.1986757516860962, + "logps/chosen": -0.6745157241821289, + "logps/rejected": -1.5598911046981812, + "loss": 0.7454, + "odds_ratio_loss": 0.3794470429420471, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03372578322887421, + "rewards/margins": 0.04426876828074455, + "rewards/rejected": -0.07799455523490906, + "sft_loss": 0.6745157241821289, + "step": 1665 + }, + { + "epoch": 1.336, + "grad_norm": 5.398708828190264, + "learning_rate": 3.3932035926241103e-06, + "logits/chosen": -0.9119020700454712, + "logits/rejected": -0.9367231130599976, + "logps/chosen": -0.8278735280036926, + "logps/rejected": -1.6131706237792969, + "loss": 0.7558, + "odds_ratio_loss": 0.4295481741428375, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04139367491006851, + "rewards/margins": 0.03926485404372215, + "rewards/rejected": -0.08065854012966156, + "sft_loss": 0.8278735280036926, + "step": 1670 + }, + { + "epoch": 1.34, + "grad_norm": 7.998094668850599, + "learning_rate": 3.3823264117846722e-06, + "logits/chosen": -0.7095149159431458, + "logits/rejected": -0.9372469186782837, + "logps/chosen": -0.6504486799240112, + "logps/rejected": -1.4709304571151733, + "loss": 0.7999, + "odds_ratio_loss": 0.3226034343242645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03252243250608444, + "rewards/margins": 0.041024088859558105, + "rewards/rejected": -0.07354652881622314, + "sft_loss": 0.6504486799240112, + "step": 1675 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 13.402324415016235, + "learning_rate": 3.3714301183045382e-06, + "logits/chosen": -1.0852770805358887, + "logits/rejected": -1.0294393301010132, + "logps/chosen": -0.6565207242965698, + "logps/rejected": -2.4157521724700928, + "loss": 0.835, + "odds_ratio_loss": 0.23401963710784912, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03282603248953819, + "rewards/margins": 0.0879615768790245, + "rewards/rejected": -0.1207876056432724, + "sft_loss": 0.6565207242965698, + "step": 1680 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 6.178693388099064, + "learning_rate": 3.360514948215339e-06, + "logits/chosen": -0.9066254496574402, + "logits/rejected": -1.067453384399414, + "logps/chosen": -0.5945879817008972, + "logps/rejected": -1.1769797801971436, + "loss": 0.7498, + "odds_ratio_loss": 0.4167722761631012, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.02972939983010292, + "rewards/margins": 0.029119592159986496, + "rewards/rejected": -0.05884898826479912, + "sft_loss": 0.5945879817008972, + "step": 1685 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 8.662276656949278, + "learning_rate": 3.349581137957604e-06, + "logits/chosen": -0.7470632791519165, + "logits/rejected": -1.1178592443466187, + "logps/chosen": -0.7460489869117737, + "logps/rejected": -1.3396974802017212, + "loss": 0.7988, + "odds_ratio_loss": 0.39526912569999695, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.037302445620298386, + "rewards/margins": 0.029682422056794167, + "rewards/rejected": -0.0669848695397377, + "sft_loss": 0.7460489869117737, + "step": 1690 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 5.759001631003996, + "learning_rate": 3.338628924375638e-06, + "logits/chosen": -0.5028613805770874, + "logits/rejected": -0.9039660692214966, + "logps/chosen": -0.7187774181365967, + "logps/rejected": -1.8051927089691162, + "loss": 0.7071, + "odds_ratio_loss": 0.32172414660453796, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03593887761235237, + "rewards/margins": 0.05432076379656792, + "rewards/rejected": -0.09025964140892029, + "sft_loss": 0.7187774181365967, + "step": 1695 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 10.588167123809765, + "learning_rate": 3.3276585447123957e-06, + "logits/chosen": -1.002184510231018, + "logits/rejected": -1.065515160560608, + "logps/chosen": -0.9078339338302612, + "logps/rejected": -1.463555097579956, + "loss": 0.829, + "odds_ratio_loss": 0.39932265877723694, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04539170488715172, + "rewards/margins": 0.02778605744242668, + "rewards/rejected": -0.0731777623295784, + "sft_loss": 0.9078339338302612, + "step": 1700 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 6.065572924023139, + "learning_rate": 3.3166702366043364e-06, + "logits/chosen": -0.9326748847961426, + "logits/rejected": -1.246891736984253, + "logps/chosen": -0.949301540851593, + "logps/rejected": -1.7571815252304077, + "loss": 0.8751, + "odds_ratio_loss": 0.5060352087020874, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04746507853269577, + "rewards/margins": 0.040393996983766556, + "rewards/rejected": -0.08785907924175262, + "sft_loss": 0.949301540851593, + "step": 1705 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 7.242528374663146, + "learning_rate": 3.3056642380762783e-06, + "logits/chosen": -0.7751462459564209, + "logits/rejected": -0.972857654094696, + "logps/chosen": -1.051584243774414, + "logps/rejected": -1.275604486465454, + "loss": 0.8721, + "odds_ratio_loss": 0.57765793800354, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05257921293377876, + "rewards/margins": 0.011201009154319763, + "rewards/rejected": -0.06378022581338882, + "sft_loss": 1.051584243774414, + "step": 1710 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 5.790819655822902, + "learning_rate": 3.294640787536245e-06, + "logits/chosen": -0.8153125643730164, + "logits/rejected": -1.0614861249923706, + "logps/chosen": -0.6195241212844849, + "logps/rejected": -1.4311720132827759, + "loss": 0.8206, + "odds_ratio_loss": 0.3184904158115387, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03097620978951454, + "rewards/margins": 0.040582384914159775, + "rewards/rejected": -0.07155859470367432, + "sft_loss": 0.6195241212844849, + "step": 1715 + }, + { + "epoch": 1.376, + "grad_norm": 11.980247948452748, + "learning_rate": 3.2836001237702993e-06, + "logits/chosen": -1.001387596130371, + "logits/rejected": -1.2363550662994385, + "logps/chosen": -0.6902490854263306, + "logps/rejected": -1.1932241916656494, + "loss": 0.8108, + "odds_ratio_loss": 0.368974506855011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034512460231781006, + "rewards/margins": 0.025148753076791763, + "rewards/rejected": -0.05966120958328247, + "sft_loss": 0.6902490854263306, + "step": 1720 + }, + { + "epoch": 1.38, + "grad_norm": 5.0828283091407735, + "learning_rate": 3.272542485937369e-06, + "logits/chosen": -1.0046998262405396, + "logits/rejected": -0.8059650659561157, + "logps/chosen": -0.7130595445632935, + "logps/rejected": -1.1542508602142334, + "loss": 0.8238, + "odds_ratio_loss": 0.4597712457180023, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03565298020839691, + "rewards/margins": 0.02205955982208252, + "rewards/rejected": -0.05771253630518913, + "sft_loss": 0.7130595445632935, + "step": 1725 + }, + { + "epoch": 1.384, + "grad_norm": 5.35699654066337, + "learning_rate": 3.2614681135640696e-06, + "logits/chosen": -0.863267719745636, + "logits/rejected": -1.239708423614502, + "logps/chosen": -0.877366840839386, + "logps/rejected": -1.5349080562591553, + "loss": 0.805, + "odds_ratio_loss": 0.3669111132621765, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04386834427714348, + "rewards/margins": 0.03287705034017563, + "rewards/rejected": -0.07674539089202881, + "sft_loss": 0.877366840839386, + "step": 1730 + }, + { + "epoch": 1.388, + "grad_norm": 7.300604070779147, + "learning_rate": 3.2503772465395143e-06, + "logits/chosen": -0.8804348111152649, + "logits/rejected": -0.6921446323394775, + "logps/chosen": -0.6613166332244873, + "logps/rejected": -1.4078587293624878, + "loss": 0.7645, + "odds_ratio_loss": 0.3147312104701996, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.033065833151340485, + "rewards/margins": 0.037327107042074203, + "rewards/rejected": -0.07039294391870499, + "sft_loss": 0.6613166332244873, + "step": 1735 + }, + { + "epoch": 1.392, + "grad_norm": 7.544399455719484, + "learning_rate": 3.2392701251101172e-06, + "logits/chosen": -0.749683678150177, + "logits/rejected": -1.0036853551864624, + "logps/chosen": -0.8362616300582886, + "logps/rejected": -1.4140605926513672, + "loss": 0.8505, + "odds_ratio_loss": 0.3677152693271637, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04181307926774025, + "rewards/margins": 0.028889944776892662, + "rewards/rejected": -0.07070302218198776, + "sft_loss": 0.8362616300582886, + "step": 1740 + }, + { + "epoch": 1.396, + "grad_norm": 5.220293895574105, + "learning_rate": 3.228146989874389e-06, + "logits/chosen": -0.8896152377128601, + "logits/rejected": -0.9276788830757141, + "logps/chosen": -0.9979864358901978, + "logps/rejected": -1.3124749660491943, + "loss": 0.9121, + "odds_ratio_loss": 0.5718216896057129, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.049899328500032425, + "rewards/margins": 0.015724416822195053, + "rewards/rejected": -0.06562374532222748, + "sft_loss": 0.9979864358901978, + "step": 1745 + }, + { + "epoch": 1.4, + "grad_norm": 8.96935863952734, + "learning_rate": 3.217008081777726e-06, + "logits/chosen": -0.8349775075912476, + "logits/rejected": -0.8859192132949829, + "logps/chosen": -0.7498109340667725, + "logps/rejected": -1.4023478031158447, + "loss": 0.7615, + "odds_ratio_loss": 0.3978427052497864, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03749055042862892, + "rewards/margins": 0.03262684494256973, + "rewards/rejected": -0.07011739164590836, + "sft_loss": 0.7498109340667725, + "step": 1750 + }, + { + "epoch": 1.404, + "grad_norm": 4.812654034935553, + "learning_rate": 3.205853642107192e-06, + "logits/chosen": -0.8215142488479614, + "logits/rejected": -0.8328277468681335, + "logps/chosen": -0.7706052660942078, + "logps/rejected": -1.4792053699493408, + "loss": 0.7118, + "odds_ratio_loss": 0.3464731276035309, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.038530267775058746, + "rewards/margins": 0.03542999550700188, + "rewards/rejected": -0.07396025955677032, + "sft_loss": 0.7706052660942078, + "step": 1755 + }, + { + "epoch": 1.408, + "grad_norm": 5.327583776041735, + "learning_rate": 3.1946839124862873e-06, + "logits/chosen": -0.8214722871780396, + "logits/rejected": -0.9568880796432495, + "logps/chosen": -0.8749944567680359, + "logps/rejected": -1.4335380792617798, + "loss": 0.8792, + "odds_ratio_loss": 0.47741952538490295, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04374972730875015, + "rewards/margins": 0.027927175164222717, + "rewards/rejected": -0.07167690247297287, + "sft_loss": 0.8749944567680359, + "step": 1760 + }, + { + "epoch": 1.412, + "grad_norm": 7.208526674502584, + "learning_rate": 3.183499134869721e-06, + "logits/chosen": -0.8680820465087891, + "logits/rejected": -0.9875283241271973, + "logps/chosen": -1.09853196144104, + "logps/rejected": -1.329644799232483, + "loss": 0.8253, + "odds_ratio_loss": 0.6291381120681763, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05492659658193588, + "rewards/margins": 0.01155565120279789, + "rewards/rejected": -0.06648223847150803, + "sft_loss": 1.09853196144104, + "step": 1765 + }, + { + "epoch": 1.416, + "grad_norm": 12.79259892670254, + "learning_rate": 3.1722995515381644e-06, + "logits/chosen": -1.3741729259490967, + "logits/rejected": -0.9874537587165833, + "logps/chosen": -0.6389180421829224, + "logps/rejected": -1.135964035987854, + "loss": 0.8255, + "odds_ratio_loss": 0.34384432435035706, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.031945906579494476, + "rewards/margins": 0.024852296337485313, + "rewards/rejected": -0.05679820105433464, + "sft_loss": 0.6389180421829224, + "step": 1770 + }, + { + "epoch": 1.42, + "grad_norm": 8.588985311767232, + "learning_rate": 3.1610854050930063e-06, + "logits/chosen": -0.8712812662124634, + "logits/rejected": -1.1689540147781372, + "logps/chosen": -1.0062519311904907, + "logps/rejected": -1.973914384841919, + "loss": 0.9165, + "odds_ratio_loss": 0.35945457220077515, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.050312597304582596, + "rewards/margins": 0.04838312417268753, + "rewards/rejected": -0.09869572520256042, + "sft_loss": 1.0062519311904907, + "step": 1775 + }, + { + "epoch": 1.424, + "grad_norm": 5.412987589877953, + "learning_rate": 3.149856938451094e-06, + "logits/chosen": -0.5333024263381958, + "logits/rejected": -1.3137054443359375, + "logps/chosen": -0.8601778149604797, + "logps/rejected": -1.641958475112915, + "loss": 0.888, + "odds_ratio_loss": 0.7197853922843933, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.043008893728256226, + "rewards/margins": 0.039089031517505646, + "rewards/rejected": -0.08209791779518127, + "sft_loss": 0.8601778149604797, + "step": 1780 + }, + { + "epoch": 1.428, + "grad_norm": 6.165182680586835, + "learning_rate": 3.1386143948394764e-06, + "logits/chosen": -1.0411813259124756, + "logits/rejected": -0.48357734084129333, + "logps/chosen": -0.7215684652328491, + "logps/rejected": -1.2277052402496338, + "loss": 0.774, + "odds_ratio_loss": 0.4655603766441345, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03607841581106186, + "rewards/margins": 0.02530684694647789, + "rewards/rejected": -0.06138526648283005, + "sft_loss": 0.7215684652328491, + "step": 1785 + }, + { + "epoch": 1.432, + "grad_norm": 5.856667562992677, + "learning_rate": 3.127358017790132e-06, + "logits/chosen": -1.1004786491394043, + "logits/rejected": -0.8851817846298218, + "logps/chosen": -0.5895828008651733, + "logps/rejected": -1.2631773948669434, + "loss": 0.7728, + "odds_ratio_loss": 0.329679012298584, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.029479142278432846, + "rewards/margins": 0.03367973119020462, + "rewards/rejected": -0.06315887719392776, + "sft_loss": 0.5895828008651733, + "step": 1790 + }, + { + "epoch": 1.436, + "grad_norm": 5.146001537629413, + "learning_rate": 3.116088051134695e-06, + "logits/chosen": -1.0093369483947754, + "logits/rejected": -1.3544859886169434, + "logps/chosen": -0.7406253218650818, + "logps/rejected": -1.5723696947097778, + "loss": 0.8389, + "odds_ratio_loss": 0.31857210397720337, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03703127056360245, + "rewards/margins": 0.041587214916944504, + "rewards/rejected": -0.07861848175525665, + "sft_loss": 0.7406253218650818, + "step": 1795 + }, + { + "epoch": 1.44, + "grad_norm": 4.843927894063411, + "learning_rate": 3.1048047389991693e-06, + "logits/chosen": -0.903844952583313, + "logits/rejected": -0.8139872550964355, + "logps/chosen": -1.0251444578170776, + "logps/rejected": -1.2178661823272705, + "loss": 0.9374, + "odds_ratio_loss": 0.6262407302856445, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05125723034143448, + "rewards/margins": 0.00963608454912901, + "rewards/rejected": -0.060893308371305466, + "sft_loss": 1.0251444578170776, + "step": 1800 + }, + { + "epoch": 1.444, + "grad_norm": 7.431201496879773, + "learning_rate": 3.0935083257986493e-06, + "logits/chosen": -0.6094223260879517, + "logits/rejected": -0.899307370185852, + "logps/chosen": -0.7733746767044067, + "logps/rejected": -1.2325997352600098, + "loss": 0.8234, + "odds_ratio_loss": 0.44773632287979126, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03866872936487198, + "rewards/margins": 0.022961260750889778, + "rewards/rejected": -0.06162998825311661, + "sft_loss": 0.7733746767044067, + "step": 1805 + }, + { + "epoch": 1.448, + "grad_norm": 11.586614332572152, + "learning_rate": 3.082199056232015e-06, + "logits/chosen": -0.7563143372535706, + "logits/rejected": -1.069698452949524, + "logps/chosen": -1.023844599723816, + "logps/rejected": -1.5895153284072876, + "loss": 0.7528, + "odds_ratio_loss": 0.4808468222618103, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.051192231476306915, + "rewards/margins": 0.028283540159463882, + "rewards/rejected": -0.0794757753610611, + "sft_loss": 1.023844599723816, + "step": 1810 + }, + { + "epoch": 1.452, + "grad_norm": 8.9164649839478, + "learning_rate": 3.0708771752766397e-06, + "logits/chosen": -1.1078320741653442, + "logits/rejected": -1.0441960096359253, + "logps/chosen": -0.8418495059013367, + "logps/rejected": -1.273721694946289, + "loss": 0.8566, + "odds_ratio_loss": 0.4950701594352722, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04209247976541519, + "rewards/margins": 0.0215936116874218, + "rewards/rejected": -0.06368608772754669, + "sft_loss": 0.8418495059013367, + "step": 1815 + }, + { + "epoch": 1.456, + "grad_norm": 5.634450444011399, + "learning_rate": 3.059542928183079e-06, + "logits/chosen": -1.020975112915039, + "logits/rejected": -1.0432868003845215, + "logps/chosen": -0.8400118947029114, + "logps/rejected": -1.9318656921386719, + "loss": 0.8575, + "odds_ratio_loss": 0.45713549852371216, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04200059548020363, + "rewards/margins": 0.05459269881248474, + "rewards/rejected": -0.09659329801797867, + "sft_loss": 0.8400118947029114, + "step": 1820 + }, + { + "epoch": 1.46, + "grad_norm": 10.1775941550842, + "learning_rate": 3.0481965604697582e-06, + "logits/chosen": -0.6371676325798035, + "logits/rejected": -1.1217305660247803, + "logps/chosen": -0.7022415995597839, + "logps/rejected": -1.5772149562835693, + "loss": 0.7258, + "odds_ratio_loss": 0.3305204510688782, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.035112082958221436, + "rewards/margins": 0.04374866932630539, + "rewards/rejected": -0.07886074483394623, + "sft_loss": 0.7022415995597839, + "step": 1825 + }, + { + "epoch": 1.464, + "grad_norm": 6.898491609991041, + "learning_rate": 3.0368383179176584e-06, + "logits/chosen": -0.5923871994018555, + "logits/rejected": -1.568169355392456, + "logps/chosen": -0.7653383016586304, + "logps/rejected": -1.7148897647857666, + "loss": 0.7246, + "odds_ratio_loss": 0.36337292194366455, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03826691955327988, + "rewards/margins": 0.04747757315635681, + "rewards/rejected": -0.08574448525905609, + "sft_loss": 0.7653383016586304, + "step": 1830 + }, + { + "epoch": 1.468, + "grad_norm": 5.964977560470005, + "learning_rate": 3.025468446564985e-06, + "logits/chosen": -1.165166974067688, + "logits/rejected": -0.8749963641166687, + "logps/chosen": -0.8071148991584778, + "logps/rejected": -1.657152533531189, + "loss": 0.865, + "odds_ratio_loss": 0.321114182472229, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04035574942827225, + "rewards/margins": 0.04250188171863556, + "rewards/rejected": -0.08285762369632721, + "sft_loss": 0.8071148991584778, + "step": 1835 + }, + { + "epoch": 1.472, + "grad_norm": 9.069724398190369, + "learning_rate": 3.0140871927018466e-06, + "logits/chosen": -1.0001251697540283, + "logits/rejected": -0.8858006596565247, + "logps/chosen": -0.8963924646377563, + "logps/rejected": -1.671795129776001, + "loss": 0.9639, + "odds_ratio_loss": 0.41972383856773376, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04481962323188782, + "rewards/margins": 0.03877013176679611, + "rewards/rejected": -0.08358974754810333, + "sft_loss": 0.8963924646377563, + "step": 1840 + }, + { + "epoch": 1.476, + "grad_norm": 8.923998975352527, + "learning_rate": 3.002694802864912e-06, + "logits/chosen": -0.8372132182121277, + "logits/rejected": -0.897018551826477, + "logps/chosen": -0.6927198171615601, + "logps/rejected": -1.5746911764144897, + "loss": 0.7381, + "odds_ratio_loss": 0.30051150918006897, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.034635990858078, + "rewards/margins": 0.044098567217588425, + "rewards/rejected": -0.07873455435037613, + "sft_loss": 0.6927198171615601, + "step": 1845 + }, + { + "epoch": 1.48, + "grad_norm": 11.770053955264393, + "learning_rate": 2.9912915238320755e-06, + "logits/chosen": -0.6851986646652222, + "logits/rejected": -1.1784414052963257, + "logps/chosen": -0.833079993724823, + "logps/rejected": -1.4270697832107544, + "loss": 0.8667, + "odds_ratio_loss": 0.42909732460975647, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04165399819612503, + "rewards/margins": 0.02969948947429657, + "rewards/rejected": -0.0713534951210022, + "sft_loss": 0.833079993724823, + "step": 1850 + }, + { + "epoch": 1.484, + "grad_norm": 6.233141067464782, + "learning_rate": 2.9798776026171087e-06, + "logits/chosen": -0.842139720916748, + "logits/rejected": -1.1131788492202759, + "logps/chosen": -0.9360324740409851, + "logps/rejected": -1.4683505296707153, + "loss": 0.8339, + "odds_ratio_loss": 0.5007175207138062, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04680163040757179, + "rewards/margins": 0.026615899056196213, + "rewards/rejected": -0.07341752201318741, + "sft_loss": 0.9360324740409851, + "step": 1855 + }, + { + "epoch": 1.488, + "grad_norm": 6.003031745272665, + "learning_rate": 2.9684532864643123e-06, + "logits/chosen": -0.8800287246704102, + "logits/rejected": -1.1693894863128662, + "logps/chosen": -0.8431293368339539, + "logps/rejected": -1.1915092468261719, + "loss": 0.8388, + "odds_ratio_loss": 0.5465279817581177, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04215647280216217, + "rewards/margins": 0.017418991774320602, + "rewards/rejected": -0.059575460851192474, + "sft_loss": 0.8431293368339539, + "step": 1860 + }, + { + "epoch": 1.492, + "grad_norm": 4.835110290839885, + "learning_rate": 2.957018822843154e-06, + "logits/chosen": -0.8184400796890259, + "logits/rejected": -1.3122928142547607, + "logps/chosen": -0.6195321679115295, + "logps/rejected": -1.447097897529602, + "loss": 0.8414, + "odds_ratio_loss": 0.3124271035194397, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.030976608395576477, + "rewards/margins": 0.04137829318642616, + "rewards/rejected": -0.07235489785671234, + "sft_loss": 0.6195321679115295, + "step": 1865 + }, + { + "epoch": 1.496, + "grad_norm": 6.054406343408761, + "learning_rate": 2.945574459442917e-06, + "logits/chosen": -0.9596401453018188, + "logits/rejected": -1.1060923337936401, + "logps/chosen": -0.6389337778091431, + "logps/rejected": -1.4341099262237549, + "loss": 0.8757, + "odds_ratio_loss": 0.32751792669296265, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03194668889045715, + "rewards/margins": 0.03975880891084671, + "rewards/rejected": -0.07170549780130386, + "sft_loss": 0.6389337778091431, + "step": 1870 + }, + { + "epoch": 1.5, + "grad_norm": 5.445524941655018, + "learning_rate": 2.9341204441673267e-06, + "logits/chosen": -0.647433876991272, + "logits/rejected": -1.0591230392456055, + "logps/chosen": -0.6938437223434448, + "logps/rejected": -1.3513716459274292, + "loss": 0.7662, + "odds_ratio_loss": 0.5081362128257751, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0346921868622303, + "rewards/margins": 0.0328763946890831, + "rewards/rejected": -0.0675685852766037, + "sft_loss": 0.6938437223434448, + "step": 1875 + }, + { + "epoch": 1.504, + "grad_norm": 5.6546570414563, + "learning_rate": 2.922657025129185e-06, + "logits/chosen": -0.6220548748970032, + "logits/rejected": -0.8203363418579102, + "logps/chosen": -0.8300703167915344, + "logps/rejected": -1.3827760219573975, + "loss": 0.7737, + "odds_ratio_loss": 0.40472111105918884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04150351509451866, + "rewards/margins": 0.02763528749346733, + "rewards/rejected": -0.06913881003856659, + "sft_loss": 0.8300703167915344, + "step": 1880 + }, + { + "epoch": 1.508, + "grad_norm": 6.230171438098299, + "learning_rate": 2.9111844506449973e-06, + "logits/chosen": -0.7947267293930054, + "logits/rejected": -0.9040530920028687, + "logps/chosen": -0.9311960935592651, + "logps/rejected": -1.4013652801513672, + "loss": 0.7731, + "odds_ratio_loss": 0.44902342557907104, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04655980318784714, + "rewards/margins": 0.02350846491754055, + "rewards/rejected": -0.07006827741861343, + "sft_loss": 0.9311960935592651, + "step": 1885 + }, + { + "epoch": 1.512, + "grad_norm": 6.7332266058939405, + "learning_rate": 2.8997029692295875e-06, + "logits/chosen": -0.814238429069519, + "logits/rejected": -0.9660609364509583, + "logps/chosen": -0.7938546538352966, + "logps/rejected": -1.457929253578186, + "loss": 0.9477, + "odds_ratio_loss": 0.4195871949195862, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03969273716211319, + "rewards/margins": 0.033203721046447754, + "rewards/rejected": -0.07289645820856094, + "sft_loss": 0.7938546538352966, + "step": 1890 + }, + { + "epoch": 1.516, + "grad_norm": 5.752938476673707, + "learning_rate": 2.888212829590719e-06, + "logits/chosen": -0.6543909311294556, + "logits/rejected": -0.5943178534507751, + "logps/chosen": -0.5880377888679504, + "logps/rejected": -1.3045456409454346, + "loss": 0.8214, + "odds_ratio_loss": 0.3315754234790802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02940189279615879, + "rewards/margins": 0.03582539036870003, + "rewards/rejected": -0.06522727757692337, + "sft_loss": 0.5880377888679504, + "step": 1895 + }, + { + "epoch": 1.52, + "grad_norm": 4.713391058679103, + "learning_rate": 2.876714280623708e-06, + "logits/chosen": -0.8652218580245972, + "logits/rejected": -0.9359322786331177, + "logps/chosen": -0.6846807599067688, + "logps/rejected": -1.5125911235809326, + "loss": 0.7733, + "odds_ratio_loss": 0.2784159481525421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03423403576016426, + "rewards/margins": 0.04139552637934685, + "rewards/rejected": -0.07562955468893051, + "sft_loss": 0.6846807599067688, + "step": 1900 + }, + { + "epoch": 1.524, + "grad_norm": 9.06130869139701, + "learning_rate": 2.8652075714060296e-06, + "logits/chosen": -0.920418381690979, + "logits/rejected": -0.9699466824531555, + "logps/chosen": -0.49238044023513794, + "logps/rejected": -1.3032811880111694, + "loss": 0.7794, + "odds_ratio_loss": 0.21321973204612732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024619024246931076, + "rewards/margins": 0.040545038878917694, + "rewards/rejected": -0.06516405940055847, + "sft_loss": 0.49238044023513794, + "step": 1905 + }, + { + "epoch": 1.528, + "grad_norm": 6.44994215725063, + "learning_rate": 2.8536929511919227e-06, + "logits/chosen": -0.5823496580123901, + "logits/rejected": -1.490532636642456, + "logps/chosen": -0.7626686096191406, + "logps/rejected": -1.4420548677444458, + "loss": 0.8321, + "odds_ratio_loss": 0.3727583587169647, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03813342750072479, + "rewards/margins": 0.03396930545568466, + "rewards/rejected": -0.07210274040699005, + "sft_loss": 0.7626686096191406, + "step": 1910 + }, + { + "epoch": 1.532, + "grad_norm": 10.576382626683463, + "learning_rate": 2.842170669406993e-06, + "logits/chosen": -0.9067608714103699, + "logits/rejected": -1.0328947305679321, + "logps/chosen": -0.7888859510421753, + "logps/rejected": -1.5981676578521729, + "loss": 0.9028, + "odds_ratio_loss": 0.3959670960903168, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.039444297552108765, + "rewards/margins": 0.04046408459544182, + "rewards/rejected": -0.07990838587284088, + "sft_loss": 0.7888859510421753, + "step": 1915 + }, + { + "epoch": 1.536, + "grad_norm": 10.142634875571975, + "learning_rate": 2.8306409756428067e-06, + "logits/chosen": -0.8129776120185852, + "logits/rejected": -1.2146799564361572, + "logps/chosen": -0.631020188331604, + "logps/rejected": -1.7902923822402954, + "loss": 0.7791, + "odds_ratio_loss": 0.4324275851249695, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03155101090669632, + "rewards/margins": 0.05796360969543457, + "rewards/rejected": -0.08951462805271149, + "sft_loss": 0.631020188331604, + "step": 1920 + }, + { + "epoch": 1.54, + "grad_norm": 6.955192952345903, + "learning_rate": 2.8191041196514874e-06, + "logits/chosen": -0.9502149820327759, + "logits/rejected": -1.0196278095245361, + "logps/chosen": -0.8367894291877747, + "logps/rejected": -1.5817101001739502, + "loss": 0.8259, + "odds_ratio_loss": 0.31859907507896423, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04183947294950485, + "rewards/margins": 0.03724603354930878, + "rewards/rejected": -0.07908550649881363, + "sft_loss": 0.8367894291877747, + "step": 1925 + }, + { + "epoch": 1.544, + "grad_norm": 8.19557532330742, + "learning_rate": 2.807560351340302e-06, + "logits/chosen": -1.2878282070159912, + "logits/rejected": -0.9704602956771851, + "logps/chosen": -0.6887832880020142, + "logps/rejected": -1.2875627279281616, + "loss": 0.7609, + "odds_ratio_loss": 0.40261825919151306, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.034439168870449066, + "rewards/margins": 0.029938969761133194, + "rewards/rejected": -0.06437813490629196, + "sft_loss": 0.6887832880020142, + "step": 1930 + }, + { + "epoch": 1.548, + "grad_norm": 11.200304687438543, + "learning_rate": 2.7960099207662535e-06, + "logits/chosen": -1.1493829488754272, + "logits/rejected": -1.0676811933517456, + "logps/chosen": -0.7964752316474915, + "logps/rejected": -1.2786619663238525, + "loss": 0.8397, + "odds_ratio_loss": 0.4415219724178314, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.039823759347200394, + "rewards/margins": 0.024109339341521263, + "rewards/rejected": -0.0639331042766571, + "sft_loss": 0.7964752316474915, + "step": 1935 + }, + { + "epoch": 1.552, + "grad_norm": 6.369713349556111, + "learning_rate": 2.7844530781306544e-06, + "logits/chosen": -0.8758190870285034, + "logits/rejected": -0.826004147529602, + "logps/chosen": -0.7934783697128296, + "logps/rejected": -1.5418189764022827, + "loss": 0.7721, + "odds_ratio_loss": 0.3249477446079254, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03967391699552536, + "rewards/margins": 0.037417031824588776, + "rewards/rejected": -0.07709096372127533, + "sft_loss": 0.7934783697128296, + "step": 1940 + }, + { + "epoch": 1.556, + "grad_norm": 7.211892711878723, + "learning_rate": 2.77289007377372e-06, + "logits/chosen": -0.8543888926506042, + "logits/rejected": -0.8026968240737915, + "logps/chosen": -0.5843006372451782, + "logps/rejected": -1.1217896938323975, + "loss": 0.8268, + "odds_ratio_loss": 0.388207346200943, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02921503409743309, + "rewards/margins": 0.02687445841729641, + "rewards/rejected": -0.05608949065208435, + "sft_loss": 0.5843006372451782, + "step": 1945 + }, + { + "epoch": 1.56, + "grad_norm": 6.68682091618046, + "learning_rate": 2.761321158169134e-06, + "logits/chosen": -1.3163114786148071, + "logits/rejected": -0.9326937794685364, + "logps/chosen": -0.6405404210090637, + "logps/rejected": -1.8683983087539673, + "loss": 0.8001, + "odds_ratio_loss": 0.3494071960449219, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.032027024775743484, + "rewards/margins": 0.0613928958773613, + "rewards/rejected": -0.09341991692781448, + "sft_loss": 0.6405404210090637, + "step": 1950 + }, + { + "epoch": 1.564, + "grad_norm": 8.853441517495, + "learning_rate": 2.749746581918629e-06, + "logits/chosen": -0.7279099225997925, + "logits/rejected": -0.7794903516769409, + "logps/chosen": -0.6024842262268066, + "logps/rejected": -1.2369228601455688, + "loss": 0.8124, + "odds_ratio_loss": 0.40761059522628784, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03012421354651451, + "rewards/margins": 0.031721919775009155, + "rewards/rejected": -0.061846137046813965, + "sft_loss": 0.6024842262268066, + "step": 1955 + }, + { + "epoch": 1.568, + "grad_norm": 4.771111743627134, + "learning_rate": 2.738166595746554e-06, + "logits/chosen": -0.6174139976501465, + "logits/rejected": -0.9298956990242004, + "logps/chosen": -0.8409628868103027, + "logps/rejected": -1.2658686637878418, + "loss": 0.8408, + "odds_ratio_loss": 0.46160784363746643, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0420481413602829, + "rewards/margins": 0.021245291456580162, + "rewards/rejected": -0.0632934421300888, + "sft_loss": 0.8409628868103027, + "step": 1960 + }, + { + "epoch": 1.572, + "grad_norm": 7.568042308319601, + "learning_rate": 2.726581450494451e-06, + "logits/chosen": -0.5073860883712769, + "logits/rejected": -0.8094170689582825, + "logps/chosen": -0.6778584718704224, + "logps/rejected": -1.0111925601959229, + "loss": 0.7613, + "odds_ratio_loss": 0.44990649819374084, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.033892922103405, + "rewards/margins": 0.016666702926158905, + "rewards/rejected": -0.050559621304273605, + "sft_loss": 0.6778584718704224, + "step": 1965 + }, + { + "epoch": 1.576, + "grad_norm": 9.06697099963702, + "learning_rate": 2.7149913971156105e-06, + "logits/chosen": -0.8449646234512329, + "logits/rejected": -1.0290577411651611, + "logps/chosen": -0.7261394262313843, + "logps/rejected": -1.0704646110534668, + "loss": 0.9347, + "odds_ratio_loss": 0.4873872697353363, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.036306969821453094, + "rewards/margins": 0.017216259613633156, + "rewards/rejected": -0.0535232312977314, + "sft_loss": 0.7261394262313843, + "step": 1970 + }, + { + "epoch": 1.58, + "grad_norm": 7.716541168169069, + "learning_rate": 2.703396686669646e-06, + "logits/chosen": -0.43536773324012756, + "logits/rejected": -1.230164647102356, + "logps/chosen": -0.9158234596252441, + "logps/rejected": -1.4675378799438477, + "loss": 0.8256, + "odds_ratio_loss": 0.4452730119228363, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.045791175216436386, + "rewards/margins": 0.027585718780755997, + "rewards/rejected": -0.07337689399719238, + "sft_loss": 0.9158234596252441, + "step": 1975 + }, + { + "epoch": 1.584, + "grad_norm": 11.344687962958782, + "learning_rate": 2.6917975703170466e-06, + "logits/chosen": -0.7123010158538818, + "logits/rejected": -1.2155797481536865, + "logps/chosen": -0.6988154649734497, + "logps/rejected": -1.039637565612793, + "loss": 0.7954, + "odds_ratio_loss": 0.4463338255882263, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.034940771758556366, + "rewards/margins": 0.017041107639670372, + "rewards/rejected": -0.05198187753558159, + "sft_loss": 0.6988154649734497, + "step": 1980 + }, + { + "epoch": 1.588, + "grad_norm": 7.131197917094538, + "learning_rate": 2.6801942993137435e-06, + "logits/chosen": -0.5410433411598206, + "logits/rejected": -1.1760857105255127, + "logps/chosen": -0.8232455253601074, + "logps/rejected": -1.0818697214126587, + "loss": 0.8893, + "odds_ratio_loss": 0.5539475679397583, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04116227477788925, + "rewards/margins": 0.012931210920214653, + "rewards/rejected": -0.05409349128603935, + "sft_loss": 0.8232455253601074, + "step": 1985 + }, + { + "epoch": 1.592, + "grad_norm": 5.5964963412533555, + "learning_rate": 2.668587125005663e-06, + "logits/chosen": -0.8460969924926758, + "logits/rejected": -1.182850956916809, + "logps/chosen": -0.6967536807060242, + "logps/rejected": -1.3319151401519775, + "loss": 0.7849, + "odds_ratio_loss": 0.41814327239990234, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03483767807483673, + "rewards/margins": 0.03175807744264603, + "rewards/rejected": -0.06659575551748276, + "sft_loss": 0.6967536807060242, + "step": 1990 + }, + { + "epoch": 1.596, + "grad_norm": 5.763026931052832, + "learning_rate": 2.6569762988232838e-06, + "logits/chosen": -0.8818701505661011, + "logits/rejected": -1.0999513864517212, + "logps/chosen": -0.7708557844161987, + "logps/rejected": -1.50910222530365, + "loss": 0.7027, + "odds_ratio_loss": 0.3464539647102356, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03854278475046158, + "rewards/margins": 0.03691232204437256, + "rewards/rejected": -0.07545509934425354, + "sft_loss": 0.7708557844161987, + "step": 1995 + }, + { + "epoch": 1.6, + "grad_norm": 4.594815757134377, + "learning_rate": 2.6453620722761897e-06, + "logits/chosen": -0.7889989018440247, + "logits/rejected": -1.0313599109649658, + "logps/chosen": -1.0150768756866455, + "logps/rejected": -1.3293017148971558, + "loss": 0.8486, + "odds_ratio_loss": 0.5205736756324768, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.050753843039274216, + "rewards/margins": 0.01571125164628029, + "rewards/rejected": -0.0664650946855545, + "sft_loss": 1.0150768756866455, + "step": 2000 + }, + { + "epoch": 1.604, + "grad_norm": 5.622181491776131, + "learning_rate": 2.6337446969476234e-06, + "logits/chosen": -0.6143472790718079, + "logits/rejected": -0.8800684213638306, + "logps/chosen": -0.70451819896698, + "logps/rejected": -1.127333641052246, + "loss": 0.7694, + "odds_ratio_loss": 0.47300809621810913, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03522591292858124, + "rewards/margins": 0.021140772849321365, + "rewards/rejected": -0.056366682052612305, + "sft_loss": 0.70451819896698, + "step": 2005 + }, + { + "epoch": 1.608, + "grad_norm": 5.4411456874419635, + "learning_rate": 2.6221244244890336e-06, + "logits/chosen": -0.8750918507575989, + "logits/rejected": -1.0322843790054321, + "logps/chosen": -1.0601109266281128, + "logps/rejected": -1.424378514289856, + "loss": 0.8386, + "odds_ratio_loss": 0.5982974767684937, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.05300554633140564, + "rewards/margins": 0.018213381990790367, + "rewards/rejected": -0.07121893018484116, + "sft_loss": 1.0601109266281128, + "step": 2010 + }, + { + "epoch": 1.612, + "grad_norm": 6.189805248319622, + "learning_rate": 2.6105015066146266e-06, + "logits/chosen": -1.0919370651245117, + "logits/rejected": -0.9859377145767212, + "logps/chosen": -0.7465149164199829, + "logps/rejected": -1.1737520694732666, + "loss": 0.7624, + "odds_ratio_loss": 0.41314896941185, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.037325747311115265, + "rewards/margins": 0.021361857652664185, + "rewards/rejected": -0.05868760868906975, + "sft_loss": 0.7465149164199829, + "step": 2015 + }, + { + "epoch": 1.616, + "grad_norm": 9.164029729819344, + "learning_rate": 2.5988761950959133e-06, + "logits/chosen": -0.8967428207397461, + "logits/rejected": -0.6835768818855286, + "logps/chosen": -0.7614465951919556, + "logps/rejected": -1.4321725368499756, + "loss": 0.8553, + "odds_ratio_loss": 0.4023088812828064, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03807232901453972, + "rewards/margins": 0.03353629633784294, + "rewards/rejected": -0.07160861790180206, + "sft_loss": 0.7614465951919556, + "step": 2020 + }, + { + "epoch": 1.62, + "grad_norm": 8.085049537513227, + "learning_rate": 2.587248741756253e-06, + "logits/chosen": -0.7910897731781006, + "logits/rejected": -1.1293184757232666, + "logps/chosen": -0.7620183229446411, + "logps/rejected": -1.539175033569336, + "loss": 0.7589, + "odds_ratio_loss": 0.3549695909023285, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.038100920617580414, + "rewards/margins": 0.0388578325510025, + "rewards/rejected": -0.07695874571800232, + "sft_loss": 0.7620183229446411, + "step": 2025 + }, + { + "epoch": 1.624, + "grad_norm": 10.097540227811598, + "learning_rate": 2.575619398465402e-06, + "logits/chosen": -0.8627431988716125, + "logits/rejected": -0.9311240315437317, + "logps/chosen": -0.877997875213623, + "logps/rejected": -1.0316495895385742, + "loss": 0.8384, + "odds_ratio_loss": 0.5710136890411377, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04389990121126175, + "rewards/margins": 0.007682584226131439, + "rewards/rejected": -0.05158247798681259, + "sft_loss": 0.877997875213623, + "step": 2030 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 7.090714107623907, + "learning_rate": 2.563988417134056e-06, + "logits/chosen": -0.9987133145332336, + "logits/rejected": -0.8781261444091797, + "logps/chosen": -0.5755537748336792, + "logps/rejected": -1.2042471170425415, + "loss": 0.8191, + "odds_ratio_loss": 0.31655603647232056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02877769246697426, + "rewards/margins": 0.03143466264009476, + "rewards/rejected": -0.06021235138177872, + "sft_loss": 0.5755537748336792, + "step": 2035 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 7.424256408913974, + "learning_rate": 2.5523560497083927e-06, + "logits/chosen": -0.9880864024162292, + "logits/rejected": -1.3933836221694946, + "logps/chosen": -0.6906462907791138, + "logps/rejected": -1.200685739517212, + "loss": 0.8062, + "odds_ratio_loss": 0.36662763357162476, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03453231602907181, + "rewards/margins": 0.025501970201730728, + "rewards/rejected": -0.060034286230802536, + "sft_loss": 0.6906462907791138, + "step": 2040 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 5.662873396280538, + "learning_rate": 2.5407225481646146e-06, + "logits/chosen": -0.638294517993927, + "logits/rejected": -0.9844516515731812, + "logps/chosen": -0.8093409538269043, + "logps/rejected": -1.5629360675811768, + "loss": 0.8729, + "odds_ratio_loss": 0.4065137505531311, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.040467046201229095, + "rewards/margins": 0.037679754197597504, + "rewards/rejected": -0.0781468003988266, + "sft_loss": 0.8093409538269043, + "step": 2045 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 4.8043102192548925, + "learning_rate": 2.5290881645034932e-06, + "logits/chosen": -0.5350710153579712, + "logits/rejected": -1.2690541744232178, + "logps/chosen": -0.8825578689575195, + "logps/rejected": -1.1409144401550293, + "loss": 0.8776, + "odds_ratio_loss": 0.7036653757095337, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04412789270281792, + "rewards/margins": 0.01291782595217228, + "rewards/rejected": -0.057045720517635345, + "sft_loss": 0.8825578689575195, + "step": 2050 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 5.31433548093374, + "learning_rate": 2.517453150744904e-06, + "logits/chosen": -0.8365262746810913, + "logits/rejected": -0.594868540763855, + "logps/chosen": -0.7498319745063782, + "logps/rejected": -1.5040110349655151, + "loss": 0.7332, + "odds_ratio_loss": 0.33074039220809937, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03749159723520279, + "rewards/margins": 0.03770895302295685, + "rewards/rejected": -0.07520055025815964, + "sft_loss": 0.7498319745063782, + "step": 2055 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 6.655827286565878, + "learning_rate": 2.5058177589223766e-06, + "logits/chosen": -0.8108224868774414, + "logits/rejected": -0.8111976385116577, + "logps/chosen": -0.8283792734146118, + "logps/rejected": -1.2744197845458984, + "loss": 0.874, + "odds_ratio_loss": 0.4357910752296448, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04141896218061447, + "rewards/margins": 0.022302016615867615, + "rewards/rejected": -0.06372098624706268, + "sft_loss": 0.8283792734146118, + "step": 2060 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 8.036123405257694, + "learning_rate": 2.4941822410776247e-06, + "logits/chosen": -1.0825451612472534, + "logits/rejected": -1.170818567276001, + "logps/chosen": -0.9788244366645813, + "logps/rejected": -1.9613186120986938, + "loss": 0.7772, + "odds_ratio_loss": 0.3012261986732483, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04894121736288071, + "rewards/margins": 0.04912471026182175, + "rewards/rejected": -0.09806593507528305, + "sft_loss": 0.9788244366645813, + "step": 2065 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 6.061444627487459, + "learning_rate": 2.482546849255096e-06, + "logits/chosen": -0.7679504752159119, + "logits/rejected": -0.9109829664230347, + "logps/chosen": -0.6975770592689514, + "logps/rejected": -1.4365979433059692, + "loss": 0.8016, + "odds_ratio_loss": 0.29892677068710327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03487885370850563, + "rewards/margins": 0.03695103898644447, + "rewards/rejected": -0.0718298926949501, + "sft_loss": 0.6975770592689514, + "step": 2070 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 4.509609302945273, + "learning_rate": 2.470911835496508e-06, + "logits/chosen": -0.9424687623977661, + "logits/rejected": -1.1301240921020508, + "logps/chosen": -0.6970169544219971, + "logps/rejected": -1.6458070278167725, + "loss": 0.8016, + "odds_ratio_loss": 0.3055366277694702, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034850846976041794, + "rewards/margins": 0.04743950814008713, + "rewards/rejected": -0.08229035139083862, + "sft_loss": 0.6970169544219971, + "step": 2075 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 7.765550436552941, + "learning_rate": 2.4592774518353858e-06, + "logits/chosen": -0.8300328254699707, + "logits/rejected": -1.3914577960968018, + "logps/chosen": -0.7543548345565796, + "logps/rejected": -1.325756311416626, + "loss": 0.8444, + "odds_ratio_loss": 0.37441879510879517, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03771774470806122, + "rewards/margins": 0.028570080175995827, + "rewards/rejected": -0.0662878230214119, + "sft_loss": 0.7543548345565796, + "step": 2080 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 5.908452371029711, + "learning_rate": 2.447643950291608e-06, + "logits/chosen": -0.6916254758834839, + "logits/rejected": -1.0722490549087524, + "logps/chosen": -0.8863399624824524, + "logps/rejected": -1.6229625940322876, + "loss": 0.7622, + "odds_ratio_loss": 0.4502854347229004, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04431699588894844, + "rewards/margins": 0.036831121891736984, + "rewards/rejected": -0.08114812523126602, + "sft_loss": 0.8863399624824524, + "step": 2085 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 12.568326286094635, + "learning_rate": 2.436011582865945e-06, + "logits/chosen": -0.764611542224884, + "logits/rejected": -1.219498872756958, + "logps/chosen": -0.8522571325302124, + "logps/rejected": -1.5530402660369873, + "loss": 0.8979, + "odds_ratio_loss": 0.4483674466609955, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04261285811662674, + "rewards/margins": 0.035039156675338745, + "rewards/rejected": -0.07765202224254608, + "sft_loss": 0.8522571325302124, + "step": 2090 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 5.284478090573637, + "learning_rate": 2.4243806015345988e-06, + "logits/chosen": -1.1326720714569092, + "logits/rejected": -0.9926662445068359, + "logps/chosen": -0.636461615562439, + "logps/rejected": -1.5491859912872314, + "loss": 0.7837, + "odds_ratio_loss": 0.3587147295475006, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03182308003306389, + "rewards/margins": 0.04563622921705246, + "rewards/rejected": -0.07745930552482605, + "sft_loss": 0.636461615562439, + "step": 2095 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 5.748225305959376, + "learning_rate": 2.4127512582437486e-06, + "logits/chosen": -0.8698973655700684, + "logits/rejected": -1.1889925003051758, + "logps/chosen": -0.825188934803009, + "logps/rejected": -1.2668731212615967, + "loss": 0.916, + "odds_ratio_loss": 0.5938987731933594, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.041259441524744034, + "rewards/margins": 0.022084210067987442, + "rewards/rejected": -0.06334365904331207, + "sft_loss": 0.825188934803009, + "step": 2100 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 13.816645361412702, + "learning_rate": 2.4011238049040875e-06, + "logits/chosen": -0.9653146862983704, + "logits/rejected": -1.273425817489624, + "logps/chosen": -0.48048973083496094, + "logps/rejected": -1.9898446798324585, + "loss": 0.7611, + "odds_ratio_loss": 0.20829610526561737, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.024024488404393196, + "rewards/margins": 0.07546775043010712, + "rewards/rejected": -0.09949223697185516, + "sft_loss": 0.48048973083496094, + "step": 2105 + }, + { + "epoch": 1.688, + "grad_norm": 7.128418390551076, + "learning_rate": 2.3894984933853734e-06, + "logits/chosen": -1.2187275886535645, + "logits/rejected": -0.9569811820983887, + "logps/chosen": -0.7580611109733582, + "logps/rejected": -1.5453588962554932, + "loss": 0.7746, + "odds_ratio_loss": 0.32491961121559143, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.037903059273958206, + "rewards/margins": 0.03936489298939705, + "rewards/rejected": -0.07726794481277466, + "sft_loss": 0.7580611109733582, + "step": 2110 + }, + { + "epoch": 1.692, + "grad_norm": 9.905564606255297, + "learning_rate": 2.377875575510967e-06, + "logits/chosen": -0.6468038558959961, + "logits/rejected": -1.1671807765960693, + "logps/chosen": -0.8859814405441284, + "logps/rejected": -1.915808916091919, + "loss": 0.8705, + "odds_ratio_loss": 0.3132167458534241, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04429907351732254, + "rewards/margins": 0.05149136856198311, + "rewards/rejected": -0.09579044580459595, + "sft_loss": 0.8859814405441284, + "step": 2115 + }, + { + "epoch": 1.696, + "grad_norm": 5.125087558911786, + "learning_rate": 2.366255303052377e-06, + "logits/chosen": -0.7792321443557739, + "logits/rejected": -0.8570452928543091, + "logps/chosen": -1.0220623016357422, + "logps/rejected": -1.2880709171295166, + "loss": 0.7697, + "odds_ratio_loss": 0.534646213054657, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05110311508178711, + "rewards/margins": 0.013300428166985512, + "rewards/rejected": -0.06440354138612747, + "sft_loss": 1.0220623016357422, + "step": 2120 + }, + { + "epoch": 1.7, + "grad_norm": 7.717361248878697, + "learning_rate": 2.3546379277238107e-06, + "logits/chosen": -0.8779585957527161, + "logits/rejected": -1.2526636123657227, + "logps/chosen": -1.0405645370483398, + "logps/rejected": -1.4684456586837769, + "loss": 0.8437, + "odds_ratio_loss": 0.5074051022529602, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05202822759747505, + "rewards/margins": 0.02139405533671379, + "rewards/rejected": -0.07342227548360825, + "sft_loss": 1.0405645370483398, + "step": 2125 + }, + { + "epoch": 1.704, + "grad_norm": 8.595275466816915, + "learning_rate": 2.3430237011767166e-06, + "logits/chosen": -0.7775979042053223, + "logits/rejected": -1.551299810409546, + "logps/chosen": -0.7821834683418274, + "logps/rejected": -2.03926944732666, + "loss": 0.8241, + "odds_ratio_loss": 0.37603259086608887, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03910917788743973, + "rewards/margins": 0.06285430490970612, + "rewards/rejected": -0.10196347534656525, + "sft_loss": 0.7821834683418274, + "step": 2130 + }, + { + "epoch": 1.708, + "grad_norm": 5.594173003610877, + "learning_rate": 2.3314128749943376e-06, + "logits/chosen": -1.2585442066192627, + "logits/rejected": -0.8739719390869141, + "logps/chosen": -1.027630090713501, + "logps/rejected": -1.4903634786605835, + "loss": 0.7552, + "odds_ratio_loss": 0.51094651222229, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05138150602579117, + "rewards/margins": 0.023136669769883156, + "rewards/rejected": -0.07451816648244858, + "sft_loss": 1.027630090713501, + "step": 2135 + }, + { + "epoch": 1.712, + "grad_norm": 6.179490163786233, + "learning_rate": 2.319805700686257e-06, + "logits/chosen": -0.7619789838790894, + "logits/rejected": -1.402626633644104, + "logps/chosen": -1.0751920938491821, + "logps/rejected": -1.9567829370498657, + "loss": 0.8477, + "odds_ratio_loss": 0.3091539442539215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.053759604692459106, + "rewards/margins": 0.04407954216003418, + "rewards/rejected": -0.09783915430307388, + "sft_loss": 1.0751920938491821, + "step": 2140 + }, + { + "epoch": 1.716, + "grad_norm": 5.014263826927916, + "learning_rate": 2.3082024296829538e-06, + "logits/chosen": -1.2588303089141846, + "logits/rejected": -0.8437775373458862, + "logps/chosen": -0.7372163534164429, + "logps/rejected": -1.173533320426941, + "loss": 0.7309, + "odds_ratio_loss": 0.4651245176792145, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.036860816180706024, + "rewards/margins": 0.021815849468111992, + "rewards/rejected": -0.058676671236753464, + "sft_loss": 0.7372163534164429, + "step": 2145 + }, + { + "epoch": 1.72, + "grad_norm": 12.671731572893654, + "learning_rate": 2.296603313330355e-06, + "logits/chosen": -0.940277099609375, + "logits/rejected": -0.9961813688278198, + "logps/chosen": -0.6746289134025574, + "logps/rejected": -1.4196538925170898, + "loss": 0.79, + "odds_ratio_loss": 0.29419276118278503, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.033731453120708466, + "rewards/margins": 0.037251245230436325, + "rewards/rejected": -0.07098269462585449, + "sft_loss": 0.6746289134025574, + "step": 2150 + }, + { + "epoch": 1.724, + "grad_norm": 8.807564564413038, + "learning_rate": 2.2850086028843894e-06, + "logits/chosen": -0.5990036129951477, + "logits/rejected": -1.1123206615447998, + "logps/chosen": -0.7771926522254944, + "logps/rejected": -1.3297137022018433, + "loss": 0.7942, + "odds_ratio_loss": 0.3976249098777771, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03885963559150696, + "rewards/margins": 0.027626052498817444, + "rewards/rejected": -0.0664856880903244, + "sft_loss": 0.7771926522254944, + "step": 2155 + }, + { + "epoch": 1.728, + "grad_norm": 6.458841654862515, + "learning_rate": 2.2734185495055503e-06, + "logits/chosen": -1.034919261932373, + "logits/rejected": -0.8573856353759766, + "logps/chosen": -0.7438480257987976, + "logps/rejected": -1.2708542346954346, + "loss": 0.6921, + "odds_ratio_loss": 0.4702334403991699, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03719240427017212, + "rewards/margins": 0.026350298896431923, + "rewards/rejected": -0.06354270875453949, + "sft_loss": 0.7438480257987976, + "step": 2160 + }, + { + "epoch": 1.732, + "grad_norm": 10.704475448149418, + "learning_rate": 2.2618334042534464e-06, + "logits/chosen": -0.9798957705497742, + "logits/rejected": -0.9897342920303345, + "logps/chosen": -0.6244773864746094, + "logps/rejected": -1.1638376712799072, + "loss": 0.7625, + "odds_ratio_loss": 0.35523417592048645, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03122386895120144, + "rewards/margins": 0.026968013495206833, + "rewards/rejected": -0.05819188430905342, + "sft_loss": 0.6244773864746094, + "step": 2165 + }, + { + "epoch": 1.736, + "grad_norm": 8.192553193640592, + "learning_rate": 2.250253418081373e-06, + "logits/chosen": -0.960060715675354, + "logits/rejected": -0.8580840229988098, + "logps/chosen": -0.7168663740158081, + "logps/rejected": -1.152551531791687, + "loss": 0.7857, + "odds_ratio_loss": 0.495716392993927, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03584332391619682, + "rewards/margins": 0.021784260869026184, + "rewards/rejected": -0.05762758105993271, + "sft_loss": 0.7168663740158081, + "step": 2170 + }, + { + "epoch": 1.74, + "grad_norm": 8.321261439994862, + "learning_rate": 2.238678841830867e-06, + "logits/chosen": -0.7686837911605835, + "logits/rejected": -1.141378402709961, + "logps/chosen": -1.0380868911743164, + "logps/rejected": -1.489746332168579, + "loss": 0.806, + "odds_ratio_loss": 0.4990416467189789, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0519043430685997, + "rewards/margins": 0.022582972422242165, + "rewards/rejected": -0.07448731362819672, + "sft_loss": 1.0380868911743164, + "step": 2175 + }, + { + "epoch": 1.744, + "grad_norm": 6.016319364654182, + "learning_rate": 2.22710992622628e-06, + "logits/chosen": -1.1537871360778809, + "logits/rejected": -0.9235156774520874, + "logps/chosen": -1.0170602798461914, + "logps/rejected": -2.0361647605895996, + "loss": 0.8776, + "odds_ratio_loss": 0.5887846350669861, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05085300654172897, + "rewards/margins": 0.05095522850751877, + "rewards/rejected": -0.10180824995040894, + "sft_loss": 1.0170602798461914, + "step": 2180 + }, + { + "epoch": 1.748, + "grad_norm": 10.90847815557463, + "learning_rate": 2.2155469218693464e-06, + "logits/chosen": -0.8062864542007446, + "logits/rejected": -0.944413959980011, + "logps/chosen": -0.8070542216300964, + "logps/rejected": -1.5685523748397827, + "loss": 0.873, + "odds_ratio_loss": 0.5560692548751831, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0403527170419693, + "rewards/margins": 0.038074906915426254, + "rewards/rejected": -0.07842762768268585, + "sft_loss": 0.8070542216300964, + "step": 2185 + }, + { + "epoch": 1.752, + "grad_norm": 12.956912471620969, + "learning_rate": 2.2039900792337477e-06, + "logits/chosen": -0.9245501756668091, + "logits/rejected": -0.7286485433578491, + "logps/chosen": -1.1378872394561768, + "logps/rejected": -1.3941930532455444, + "loss": 0.8626, + "odds_ratio_loss": 0.5717015266418457, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.056894369423389435, + "rewards/margins": 0.012815283611416817, + "rewards/rejected": -0.0697096511721611, + "sft_loss": 1.1378872394561768, + "step": 2190 + }, + { + "epoch": 1.756, + "grad_norm": 11.281398520267489, + "learning_rate": 2.192439648659699e-06, + "logits/chosen": -1.042218804359436, + "logits/rejected": -0.6264259219169617, + "logps/chosen": -1.035028338432312, + "logps/rejected": -1.542610764503479, + "loss": 0.8083, + "odds_ratio_loss": 0.5132459998130798, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05175141617655754, + "rewards/margins": 0.025379126891493797, + "rewards/rejected": -0.07713054120540619, + "sft_loss": 1.035028338432312, + "step": 2195 + }, + { + "epoch": 1.76, + "grad_norm": 7.2793892053250735, + "learning_rate": 2.1808958803485134e-06, + "logits/chosen": -1.051295280456543, + "logits/rejected": -0.6670552492141724, + "logps/chosen": -0.6154602766036987, + "logps/rejected": -1.3951618671417236, + "loss": 0.8288, + "odds_ratio_loss": 0.2627789378166199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030773013830184937, + "rewards/margins": 0.038985081017017365, + "rewards/rejected": -0.0697580948472023, + "sft_loss": 0.6154602766036987, + "step": 2200 + }, + { + "epoch": 1.764, + "grad_norm": 6.172083283993613, + "learning_rate": 2.1693590243571937e-06, + "logits/chosen": -0.8260824084281921, + "logits/rejected": -1.0545563697814941, + "logps/chosen": -0.8104951977729797, + "logps/rejected": -1.3007080554962158, + "loss": 0.7999, + "odds_ratio_loss": 0.3759697377681732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04052475839853287, + "rewards/margins": 0.024510642513632774, + "rewards/rejected": -0.0650353953242302, + "sft_loss": 0.8104951977729797, + "step": 2205 + }, + { + "epoch": 1.768, + "grad_norm": 5.796258673720241, + "learning_rate": 2.157829330593008e-06, + "logits/chosen": -0.5958111882209778, + "logits/rejected": -1.353058099746704, + "logps/chosen": -0.7925723791122437, + "logps/rejected": -1.575204610824585, + "loss": 0.8336, + "odds_ratio_loss": 0.31617340445518494, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03962862119078636, + "rewards/margins": 0.03913161903619766, + "rewards/rejected": -0.07876023650169373, + "sft_loss": 0.7925723791122437, + "step": 2210 + }, + { + "epoch": 1.772, + "grad_norm": 5.853650964962732, + "learning_rate": 2.1463070488080777e-06, + "logits/chosen": -0.7320128679275513, + "logits/rejected": -1.4626697301864624, + "logps/chosen": -0.920146107673645, + "logps/rejected": -1.587083101272583, + "loss": 0.912, + "odds_ratio_loss": 0.36855971813201904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.046007297933101654, + "rewards/margins": 0.03334684669971466, + "rewards/rejected": -0.07935415208339691, + "sft_loss": 0.920146107673645, + "step": 2215 + }, + { + "epoch": 1.776, + "grad_norm": 7.102475221339332, + "learning_rate": 2.134792428593971e-06, + "logits/chosen": -1.0143307447433472, + "logits/rejected": -1.0723531246185303, + "logps/chosen": -0.6754562854766846, + "logps/rejected": -1.2191624641418457, + "loss": 0.8002, + "odds_ratio_loss": 0.4698086380958557, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03377281501889229, + "rewards/margins": 0.027185309678316116, + "rewards/rejected": -0.060958124697208405, + "sft_loss": 0.6754562854766846, + "step": 2220 + }, + { + "epoch": 1.78, + "grad_norm": 5.132375215504824, + "learning_rate": 2.1232857193762923e-06, + "logits/chosen": -0.8438177108764648, + "logits/rejected": -0.8187214136123657, + "logps/chosen": -0.9790526628494263, + "logps/rejected": -1.4144220352172852, + "loss": 0.8596, + "odds_ratio_loss": 0.5008438229560852, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.048952627927064896, + "rewards/margins": 0.021768469363451004, + "rewards/rejected": -0.0707211047410965, + "sft_loss": 0.9790526628494263, + "step": 2225 + }, + { + "epoch": 1.784, + "grad_norm": 5.864011412934527, + "learning_rate": 2.1117871704092818e-06, + "logits/chosen": -0.6806105375289917, + "logits/rejected": -1.0006288290023804, + "logps/chosen": -0.6304813027381897, + "logps/rejected": -2.105440378189087, + "loss": 0.8079, + "odds_ratio_loss": 0.289185106754303, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.031524065881967545, + "rewards/margins": 0.07374795526266098, + "rewards/rejected": -0.10527201741933823, + "sft_loss": 0.6304813027381897, + "step": 2230 + }, + { + "epoch": 1.788, + "grad_norm": 5.2792769917747755, + "learning_rate": 2.1002970307704134e-06, + "logits/chosen": -1.001686930656433, + "logits/rejected": -0.7307732105255127, + "logps/chosen": -0.4030815660953522, + "logps/rejected": -2.1336350440979004, + "loss": 0.8382, + "odds_ratio_loss": 0.0816396027803421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020154079422354698, + "rewards/margins": 0.08652767539024353, + "rewards/rejected": -0.10668174922466278, + "sft_loss": 0.4030815660953522, + "step": 2235 + }, + { + "epoch": 1.792, + "grad_norm": 6.219063011453153, + "learning_rate": 2.0888155493550027e-06, + "logits/chosen": -0.6964353322982788, + "logits/rejected": -0.9690049290657043, + "logps/chosen": -0.9550386667251587, + "logps/rejected": -1.2657750844955444, + "loss": 0.9063, + "odds_ratio_loss": 0.5276353359222412, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.047751929610967636, + "rewards/margins": 0.015536829829216003, + "rewards/rejected": -0.06328876316547394, + "sft_loss": 0.9550386667251587, + "step": 2240 + }, + { + "epoch": 1.796, + "grad_norm": 6.217554170161715, + "learning_rate": 2.0773429748708153e-06, + "logits/chosen": -1.1106126308441162, + "logits/rejected": -0.8946078419685364, + "logps/chosen": -0.7019718289375305, + "logps/rejected": -2.1581504344940186, + "loss": 0.7725, + "odds_ratio_loss": 0.34209513664245605, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.035098589956760406, + "rewards/margins": 0.07280893623828888, + "rewards/rejected": -0.10790753364562988, + "sft_loss": 0.7019718289375305, + "step": 2245 + }, + { + "epoch": 1.8, + "grad_norm": 6.803417111045092, + "learning_rate": 2.0658795558326745e-06, + "logits/chosen": -1.081549048423767, + "logits/rejected": -0.8075908422470093, + "logps/chosen": -1.0295336246490479, + "logps/rejected": -1.406858205795288, + "loss": 0.8343, + "odds_ratio_loss": 0.48274731636047363, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05147667974233627, + "rewards/margins": 0.018866227939724922, + "rewards/rejected": -0.07034290581941605, + "sft_loss": 1.0295336246490479, + "step": 2250 + }, + { + "epoch": 1.804, + "grad_norm": 4.989031637803273, + "learning_rate": 2.0544255405570843e-06, + "logits/chosen": -0.9463821649551392, + "logits/rejected": -1.047937035560608, + "logps/chosen": -0.6406688690185547, + "logps/rejected": -1.53047776222229, + "loss": 0.8074, + "odds_ratio_loss": 0.3679446578025818, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.032033443450927734, + "rewards/margins": 0.04449043795466423, + "rewards/rejected": -0.07652387768030167, + "sft_loss": 0.6406688690185547, + "step": 2255 + }, + { + "epoch": 1.808, + "grad_norm": 6.658453228798975, + "learning_rate": 2.0429811771568468e-06, + "logits/chosen": -0.7510989308357239, + "logits/rejected": -1.236093282699585, + "logps/chosen": -0.8555262684822083, + "logps/rejected": -1.224966049194336, + "loss": 0.8413, + "odds_ratio_loss": 0.4976680874824524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04277631267905235, + "rewards/margins": 0.01847199536859989, + "rewards/rejected": -0.061248309910297394, + "sft_loss": 0.8555262684822083, + "step": 2260 + }, + { + "epoch": 1.812, + "grad_norm": 4.401496364800286, + "learning_rate": 2.031546713535688e-06, + "logits/chosen": -0.9858955144882202, + "logits/rejected": -0.9150092005729675, + "logps/chosen": -0.7890244722366333, + "logps/rejected": -1.2879811525344849, + "loss": 0.7905, + "odds_ratio_loss": 0.3429797291755676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039451222866773605, + "rewards/margins": 0.02494782581925392, + "rewards/rejected": -0.06439904868602753, + "sft_loss": 0.7890244722366333, + "step": 2265 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 9.174834943476265, + "learning_rate": 2.0201223973828917e-06, + "logits/chosen": -0.8151395916938782, + "logits/rejected": -1.2962208986282349, + "logps/chosen": -0.7152162790298462, + "logps/rejected": -1.581583857536316, + "loss": 0.7903, + "odds_ratio_loss": 0.385868102312088, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03576081246137619, + "rewards/margins": 0.04331838712096214, + "rewards/rejected": -0.07907919585704803, + "sft_loss": 0.7152162790298462, + "step": 2270 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 6.3991437024550475, + "learning_rate": 2.0087084761679245e-06, + "logits/chosen": -0.8196107149124146, + "logits/rejected": -1.1247520446777344, + "logps/chosen": -0.8312146067619324, + "logps/rejected": -1.3900336027145386, + "loss": 0.7936, + "odds_ratio_loss": 0.44879013299942017, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04156073182821274, + "rewards/margins": 0.027940943837165833, + "rewards/rejected": -0.06950168311595917, + "sft_loss": 0.8312146067619324, + "step": 2275 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 7.113083017693733, + "learning_rate": 1.997305197135089e-06, + "logits/chosen": -0.6806960701942444, + "logits/rejected": -0.8038619160652161, + "logps/chosen": -0.6782374382019043, + "logps/rejected": -1.2599354982376099, + "loss": 0.8263, + "odds_ratio_loss": 0.379936158657074, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.033911872655153275, + "rewards/margins": 0.02908490039408207, + "rewards/rejected": -0.0629967749118805, + "sft_loss": 0.6782374382019043, + "step": 2280 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 7.6235951885998015, + "learning_rate": 1.985912807298154e-06, + "logits/chosen": -1.0690265893936157, + "logits/rejected": -0.9779754877090454, + "logps/chosen": -0.7425954937934875, + "logps/rejected": -1.426913857460022, + "loss": 0.7348, + "odds_ratio_loss": 0.357009619474411, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03712977468967438, + "rewards/margins": 0.03421591967344284, + "rewards/rejected": -0.07134570181369781, + "sft_loss": 0.7425954937934875, + "step": 2285 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 5.615671437369734, + "learning_rate": 1.9745315534350157e-06, + "logits/chosen": -0.9129589200019836, + "logits/rejected": -0.6167286038398743, + "logps/chosen": -1.065497636795044, + "logps/rejected": -1.2868871688842773, + "loss": 0.8386, + "odds_ratio_loss": 0.6246089339256287, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05327488109469414, + "rewards/margins": 0.011069480329751968, + "rewards/rejected": -0.0643443614244461, + "sft_loss": 1.065497636795044, + "step": 2290 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 6.084128369072109, + "learning_rate": 1.963161682082342e-06, + "logits/chosen": -0.9707215428352356, + "logits/rejected": -0.7441933751106262, + "logps/chosen": -0.9074664115905762, + "logps/rejected": -1.3849105834960938, + "loss": 0.8658, + "odds_ratio_loss": 0.4366206228733063, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04537331685423851, + "rewards/margins": 0.023872217163443565, + "rewards/rejected": -0.06924553215503693, + "sft_loss": 0.9074664115905762, + "step": 2295 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 7.352748854504632, + "learning_rate": 1.9518034395302413e-06, + "logits/chosen": -0.7245827317237854, + "logits/rejected": -1.2959226369857788, + "logps/chosen": -0.7377294301986694, + "logps/rejected": -1.4373664855957031, + "loss": 0.9049, + "odds_ratio_loss": 0.33754152059555054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03688646852970123, + "rewards/margins": 0.034981850534677505, + "rewards/rejected": -0.07186831533908844, + "sft_loss": 0.7377294301986694, + "step": 2300 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 4.828613164349784, + "learning_rate": 1.940457071816922e-06, + "logits/chosen": -0.7781765460968018, + "logits/rejected": -1.140925407409668, + "logps/chosen": -0.775480329990387, + "logps/rejected": -1.096097707748413, + "loss": 0.8761, + "odds_ratio_loss": 0.48161450028419495, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03877401351928711, + "rewards/margins": 0.016030868515372276, + "rewards/rejected": -0.054804880172014236, + "sft_loss": 0.775480329990387, + "step": 2305 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 6.702848454346183, + "learning_rate": 1.9291228247233607e-06, + "logits/chosen": -0.786065936088562, + "logits/rejected": -1.1700165271759033, + "logps/chosen": -0.8791561126708984, + "logps/rejected": -1.355006217956543, + "loss": 0.7591, + "odds_ratio_loss": 0.46694913506507874, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04395780712366104, + "rewards/margins": 0.023792508989572525, + "rewards/rejected": -0.06775031983852386, + "sft_loss": 0.8791561126708984, + "step": 2310 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 7.711938205346599, + "learning_rate": 1.9178009437679855e-06, + "logits/chosen": -1.0333006381988525, + "logits/rejected": -0.5370423197746277, + "logps/chosen": -0.3634825050830841, + "logps/rejected": -1.1746513843536377, + "loss": 0.7101, + "odds_ratio_loss": 0.2676122784614563, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018174124881625175, + "rewards/margins": 0.04055844619870186, + "rewards/rejected": -0.05873257666826248, + "sft_loss": 0.3634825050830841, + "step": 2315 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 8.761311945720486, + "learning_rate": 1.9064916742013515e-06, + "logits/chosen": -0.33012256026268005, + "logits/rejected": -1.075441598892212, + "logps/chosen": -0.9848030209541321, + "logps/rejected": -1.3406559228897095, + "loss": 0.8125, + "odds_ratio_loss": 0.5339032411575317, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04924015700817108, + "rewards/margins": 0.01779264211654663, + "rewards/rejected": -0.06703279912471771, + "sft_loss": 0.9848030209541321, + "step": 2320 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 8.996579586998216, + "learning_rate": 1.895195261000831e-06, + "logits/chosen": -1.030765414237976, + "logits/rejected": -0.9685811996459961, + "logps/chosen": -0.8394074440002441, + "logps/rejected": -1.4255101680755615, + "loss": 0.7645, + "odds_ratio_loss": 0.4348466992378235, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04197037220001221, + "rewards/margins": 0.029305141419172287, + "rewards/rejected": -0.07127551734447479, + "sft_loss": 0.8394074440002441, + "step": 2325 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 7.188662135547713, + "learning_rate": 1.883911948865306e-06, + "logits/chosen": -0.7353323698043823, + "logits/rejected": -0.9462113380432129, + "logps/chosen": -0.6895793676376343, + "logps/rejected": -1.388048529624939, + "loss": 0.7353, + "odds_ratio_loss": 0.40559130907058716, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03447896987199783, + "rewards/margins": 0.034923456609249115, + "rewards/rejected": -0.06940243393182755, + "sft_loss": 0.6895793676376343, + "step": 2330 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 7.322809013987031, + "learning_rate": 1.872641982209868e-06, + "logits/chosen": -0.9933965802192688, + "logits/rejected": -0.6932544112205505, + "logps/chosen": -0.6619799137115479, + "logps/rejected": -1.2481772899627686, + "loss": 0.7704, + "odds_ratio_loss": 0.38309139013290405, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.033098991960287094, + "rewards/margins": 0.029309872537851334, + "rewards/rejected": -0.06240885704755783, + "sft_loss": 0.6619799137115479, + "step": 2335 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 4.882755527025201, + "learning_rate": 1.8613856051605242e-06, + "logits/chosen": -0.7146080732345581, + "logits/rejected": -1.3212826251983643, + "logps/chosen": -0.970722496509552, + "logps/rejected": -1.4650259017944336, + "loss": 0.7754, + "odds_ratio_loss": 0.4080827236175537, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04853612929582596, + "rewards/margins": 0.02471516653895378, + "rewards/rejected": -0.07325129956007004, + "sft_loss": 0.970722496509552, + "step": 2340 + }, + { + "epoch": 1.876, + "grad_norm": 6.023565399066244, + "learning_rate": 1.850143061548907e-06, + "logits/chosen": -0.8632639050483704, + "logits/rejected": -1.1179392337799072, + "logps/chosen": -0.7041983008384705, + "logps/rejected": -2.4500975608825684, + "loss": 0.815, + "odds_ratio_loss": 0.27313196659088135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03520992025732994, + "rewards/margins": 0.08729497343301773, + "rewards/rejected": -0.12250488996505737, + "sft_loss": 0.7041983008384705, + "step": 2345 + }, + { + "epoch": 1.88, + "grad_norm": 7.758596635756381, + "learning_rate": 1.8389145949069953e-06, + "logits/chosen": -1.2442716360092163, + "logits/rejected": -0.7619195580482483, + "logps/chosen": -0.8322132229804993, + "logps/rejected": -1.4952902793884277, + "loss": 0.9443, + "odds_ratio_loss": 0.39093518257141113, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.041610658168792725, + "rewards/margins": 0.03315385431051254, + "rewards/rejected": -0.07476451992988586, + "sft_loss": 0.8322132229804993, + "step": 2350 + }, + { + "epoch": 1.884, + "grad_norm": 6.603546187128215, + "learning_rate": 1.827700448461836e-06, + "logits/chosen": -0.8992365598678589, + "logits/rejected": -0.41766390204429626, + "logps/chosen": -0.7582792043685913, + "logps/rejected": -1.665892243385315, + "loss": 0.8475, + "odds_ratio_loss": 0.422523558139801, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0379139669239521, + "rewards/margins": 0.04538065940141678, + "rewards/rejected": -0.08329462260007858, + "sft_loss": 0.7582792043685913, + "step": 2355 + }, + { + "epoch": 1.888, + "grad_norm": 4.094000393336399, + "learning_rate": 1.816500865130279e-06, + "logits/chosen": -1.0214028358459473, + "logits/rejected": -0.9226012229919434, + "logps/chosen": -0.78780597448349, + "logps/rejected": -1.5264439582824707, + "loss": 0.7332, + "odds_ratio_loss": 0.39362823963165283, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03939030319452286, + "rewards/margins": 0.036931902170181274, + "rewards/rejected": -0.07632219791412354, + "sft_loss": 0.78780597448349, + "step": 2360 + }, + { + "epoch": 1.892, + "grad_norm": 5.145690592897293, + "learning_rate": 1.8053160875137137e-06, + "logits/chosen": -0.9314631223678589, + "logits/rejected": -0.9437487721443176, + "logps/chosen": -1.0313526391983032, + "logps/rejected": -1.4607595205307007, + "loss": 0.8259, + "odds_ratio_loss": 0.5963125228881836, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05156763270497322, + "rewards/margins": 0.021470338106155396, + "rewards/rejected": -0.07303796708583832, + "sft_loss": 1.0313526391983032, + "step": 2365 + }, + { + "epoch": 1.896, + "grad_norm": 7.417881702928459, + "learning_rate": 1.7941463578928088e-06, + "logits/chosen": -0.797860324382782, + "logits/rejected": -1.1919324398040771, + "logps/chosen": -0.879246711730957, + "logps/rejected": -1.3754128217697144, + "loss": 0.8715, + "odds_ratio_loss": 0.4276247024536133, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04396234080195427, + "rewards/margins": 0.024808308109641075, + "rewards/rejected": -0.0687706395983696, + "sft_loss": 0.879246711730957, + "step": 2370 + }, + { + "epoch": 1.9, + "grad_norm": 7.132781683007218, + "learning_rate": 1.7829919182222752e-06, + "logits/chosen": -0.972023606300354, + "logits/rejected": -1.1046335697174072, + "logps/chosen": -0.8717896342277527, + "logps/rejected": -1.0692293643951416, + "loss": 0.8009, + "odds_ratio_loss": 0.679745078086853, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.043589480221271515, + "rewards/margins": 0.009871991351246834, + "rewards/rejected": -0.0534614734351635, + "sft_loss": 0.8717896342277527, + "step": 2375 + }, + { + "epoch": 1.904, + "grad_norm": 4.092969776254415, + "learning_rate": 1.7718530101256115e-06, + "logits/chosen": -1.1578160524368286, + "logits/rejected": -1.0545146465301514, + "logps/chosen": -0.47031641006469727, + "logps/rejected": -1.8584105968475342, + "loss": 0.7973, + "odds_ratio_loss": 0.1941300332546234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023515818640589714, + "rewards/margins": 0.0694047138094902, + "rewards/rejected": -0.09292052686214447, + "sft_loss": 0.47031641006469727, + "step": 2380 + }, + { + "epoch": 1.908, + "grad_norm": 6.116647440253481, + "learning_rate": 1.7607298748898844e-06, + "logits/chosen": -1.042887568473816, + "logits/rejected": -0.9683266878128052, + "logps/chosen": -0.7862476706504822, + "logps/rejected": -1.1969187259674072, + "loss": 0.8179, + "odds_ratio_loss": 0.5028802752494812, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03931238502264023, + "rewards/margins": 0.020533554255962372, + "rewards/rejected": -0.0598459430038929, + "sft_loss": 0.7862476706504822, + "step": 2385 + }, + { + "epoch": 1.912, + "grad_norm": 5.228060511447272, + "learning_rate": 1.7496227534604859e-06, + "logits/chosen": -0.8977655172348022, + "logits/rejected": -1.0492327213287354, + "logps/chosen": -0.6170272827148438, + "logps/rejected": -1.3571946620941162, + "loss": 0.7784, + "odds_ratio_loss": 0.2826383411884308, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.030851364135742188, + "rewards/margins": 0.0370083749294281, + "rewards/rejected": -0.06785973161458969, + "sft_loss": 0.6170272827148438, + "step": 2390 + }, + { + "epoch": 1.916, + "grad_norm": 10.090456032598201, + "learning_rate": 1.7385318864359304e-06, + "logits/chosen": -0.9312974214553833, + "logits/rejected": -1.235394835472107, + "logps/chosen": -0.9995916485786438, + "logps/rejected": -1.703447937965393, + "loss": 0.8131, + "odds_ratio_loss": 0.43445831537246704, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04997957870364189, + "rewards/margins": 0.0351928174495697, + "rewards/rejected": -0.08517240732908249, + "sft_loss": 0.9995916485786438, + "step": 2395 + }, + { + "epoch": 1.92, + "grad_norm": 5.661494882202969, + "learning_rate": 1.7274575140626318e-06, + "logits/chosen": -1.0951160192489624, + "logits/rejected": -0.7472076416015625, + "logps/chosen": -0.819926917552948, + "logps/rejected": -1.4353911876678467, + "loss": 0.8619, + "odds_ratio_loss": 0.42657119035720825, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04099633917212486, + "rewards/margins": 0.03077322244644165, + "rewards/rejected": -0.07176957279443741, + "sft_loss": 0.819926917552948, + "step": 2400 + }, + { + "epoch": 1.924, + "grad_norm": 6.119264571326813, + "learning_rate": 1.7163998762297013e-06, + "logits/chosen": -1.046919584274292, + "logits/rejected": -1.0116114616394043, + "logps/chosen": -0.6755749583244324, + "logps/rejected": -1.557366967201233, + "loss": 0.8711, + "odds_ratio_loss": 0.2980736196041107, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03377874568104744, + "rewards/margins": 0.04408959671854973, + "rewards/rejected": -0.07786835730075836, + "sft_loss": 0.6755749583244324, + "step": 2405 + }, + { + "epoch": 1.928, + "grad_norm": 7.45326513403781, + "learning_rate": 1.7053592124637557e-06, + "logits/chosen": -0.9731330871582031, + "logits/rejected": -1.1434471607208252, + "logps/chosen": -0.6788956522941589, + "logps/rejected": -1.4451775550842285, + "loss": 0.8022, + "odds_ratio_loss": 0.31722068786621094, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03394477814435959, + "rewards/margins": 0.0383140966296196, + "rewards/rejected": -0.07225887477397919, + "sft_loss": 0.6788956522941589, + "step": 2410 + }, + { + "epoch": 1.932, + "grad_norm": 5.554908477058027, + "learning_rate": 1.6943357619237227e-06, + "logits/chosen": -0.9072321653366089, + "logits/rejected": -1.343854546546936, + "logps/chosen": -0.7743789553642273, + "logps/rejected": -1.7294988632202148, + "loss": 0.7934, + "odds_ratio_loss": 0.27553829550743103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03871894255280495, + "rewards/margins": 0.047755997627973557, + "rewards/rejected": -0.0864749401807785, + "sft_loss": 0.7743789553642273, + "step": 2415 + }, + { + "epoch": 1.936, + "grad_norm": 9.389343814394115, + "learning_rate": 1.6833297633956647e-06, + "logits/chosen": -1.0711301565170288, + "logits/rejected": -0.810192883014679, + "logps/chosen": -0.763390839099884, + "logps/rejected": -1.7045867443084717, + "loss": 0.8294, + "odds_ratio_loss": 0.2711796462535858, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03816954419016838, + "rewards/margins": 0.0470597967505455, + "rewards/rejected": -0.08522933721542358, + "sft_loss": 0.763390839099884, + "step": 2420 + }, + { + "epoch": 1.94, + "grad_norm": 6.732029543770066, + "learning_rate": 1.6723414552876052e-06, + "logits/chosen": -0.570145845413208, + "logits/rejected": -1.3672704696655273, + "logps/chosen": -1.0161564350128174, + "logps/rejected": -1.465152621269226, + "loss": 0.852, + "odds_ratio_loss": 0.5028510689735413, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05080782249569893, + "rewards/margins": 0.022449806332588196, + "rewards/rejected": -0.07325764000415802, + "sft_loss": 1.0161564350128174, + "step": 2425 + }, + { + "epoch": 1.944, + "grad_norm": 6.884753284401447, + "learning_rate": 1.661371075624363e-06, + "logits/chosen": -0.8588671684265137, + "logits/rejected": -1.0025584697723389, + "logps/chosen": -0.7952778935432434, + "logps/rejected": -1.3232905864715576, + "loss": 0.8421, + "odds_ratio_loss": 0.44380640983581543, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03976389393210411, + "rewards/margins": 0.02640063688158989, + "rewards/rejected": -0.066164530813694, + "sft_loss": 0.7952778935432434, + "step": 2430 + }, + { + "epoch": 1.948, + "grad_norm": 10.032821854639238, + "learning_rate": 1.6504188620423977e-06, + "logits/chosen": -0.8894034624099731, + "logits/rejected": -1.096508502960205, + "logps/chosen": -0.9423715472221375, + "logps/rejected": -1.5046286582946777, + "loss": 0.83, + "odds_ratio_loss": 0.4647153317928314, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04711857810616493, + "rewards/margins": 0.028112854808568954, + "rewards/rejected": -0.07523143291473389, + "sft_loss": 0.9423715472221375, + "step": 2435 + }, + { + "epoch": 1.952, + "grad_norm": 8.207321665297997, + "learning_rate": 1.6394850517846621e-06, + "logits/chosen": -0.9905040860176086, + "logits/rejected": -0.9356651306152344, + "logps/chosen": -0.8063279986381531, + "logps/rejected": -1.4606654644012451, + "loss": 0.7861, + "odds_ratio_loss": 0.4356115460395813, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.040316395461559296, + "rewards/margins": 0.03271687030792236, + "rewards/rejected": -0.07303327322006226, + "sft_loss": 0.8063279986381531, + "step": 2440 + }, + { + "epoch": 1.956, + "grad_norm": 6.452799460000441, + "learning_rate": 1.6285698816954626e-06, + "logits/chosen": -0.7353092432022095, + "logits/rejected": -1.1551518440246582, + "logps/chosen": -1.0718172788619995, + "logps/rejected": -1.5583527088165283, + "loss": 0.8719, + "odds_ratio_loss": 0.46111541986465454, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.05359087139368057, + "rewards/margins": 0.024326767772436142, + "rewards/rejected": -0.07791763544082642, + "sft_loss": 1.0718172788619995, + "step": 2445 + }, + { + "epoch": 1.96, + "grad_norm": 5.332170706332277, + "learning_rate": 1.6176735882153284e-06, + "logits/chosen": -0.9285340309143066, + "logits/rejected": -1.087899923324585, + "logps/chosen": -0.8756929636001587, + "logps/rejected": -1.3418347835540771, + "loss": 0.781, + "odds_ratio_loss": 0.5048962831497192, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.043784648180007935, + "rewards/margins": 0.023307088762521744, + "rewards/rejected": -0.06709174066781998, + "sft_loss": 0.8756929636001587, + "step": 2450 + }, + { + "epoch": 1.964, + "grad_norm": 6.2107913026430746, + "learning_rate": 1.6067964073758901e-06, + "logits/chosen": -0.6262539625167847, + "logits/rejected": -0.9137803316116333, + "logps/chosen": -0.7144273519515991, + "logps/rejected": -1.7826179265975952, + "loss": 0.7463, + "odds_ratio_loss": 0.21934516727924347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.035721369087696075, + "rewards/margins": 0.05340953543782234, + "rewards/rejected": -0.08913090080022812, + "sft_loss": 0.7144273519515991, + "step": 2455 + }, + { + "epoch": 1.968, + "grad_norm": 6.679465106655453, + "learning_rate": 1.5959385747947697e-06, + "logits/chosen": -0.6637876629829407, + "logits/rejected": -1.1082921028137207, + "logps/chosen": -0.7338643074035645, + "logps/rejected": -1.9011523723602295, + "loss": 0.8968, + "odds_ratio_loss": 0.26652342081069946, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03669321909546852, + "rewards/margins": 0.05836440995335579, + "rewards/rejected": -0.09505762159824371, + "sft_loss": 0.7338643074035645, + "step": 2460 + }, + { + "epoch": 1.972, + "grad_norm": 13.019725966030125, + "learning_rate": 1.5851003256704697e-06, + "logits/chosen": -0.9775069355964661, + "logits/rejected": -1.155165195465088, + "logps/chosen": -0.771982729434967, + "logps/rejected": -1.2554118633270264, + "loss": 0.8792, + "odds_ratio_loss": 0.5085214376449585, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03859913349151611, + "rewards/margins": 0.024171466007828712, + "rewards/rejected": -0.06277060508728027, + "sft_loss": 0.771982729434967, + "step": 2465 + }, + { + "epoch": 1.976, + "grad_norm": 6.415175624114794, + "learning_rate": 1.5742818947772875e-06, + "logits/chosen": -0.5195826888084412, + "logits/rejected": -0.9917033910751343, + "logps/chosen": -0.6547525525093079, + "logps/rejected": -1.6238784790039062, + "loss": 0.7833, + "odds_ratio_loss": 0.3622708320617676, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03273762762546539, + "rewards/margins": 0.04845630005002022, + "rewards/rejected": -0.08119393140077591, + "sft_loss": 0.6547525525093079, + "step": 2470 + }, + { + "epoch": 1.98, + "grad_norm": 16.881628339846156, + "learning_rate": 1.56348351646022e-06, + "logits/chosen": -1.1250643730163574, + "logits/rejected": -1.0977294445037842, + "logps/chosen": -0.7314361333847046, + "logps/rejected": -1.5915559530258179, + "loss": 0.8277, + "odds_ratio_loss": 0.3246075510978699, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03657180815935135, + "rewards/margins": 0.043005991727113724, + "rewards/rejected": -0.07957780361175537, + "sft_loss": 0.7314361333847046, + "step": 2475 + }, + { + "epoch": 1.984, + "grad_norm": 5.6365603948727, + "learning_rate": 1.552705424629898e-06, + "logits/chosen": -0.6629668474197388, + "logits/rejected": -1.1308306455612183, + "logps/chosen": -0.657507061958313, + "logps/rejected": -1.4453434944152832, + "loss": 0.8399, + "odds_ratio_loss": 0.3202642798423767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03287535160779953, + "rewards/margins": 0.03939182311296463, + "rewards/rejected": -0.07226717472076416, + "sft_loss": 0.657507061958313, + "step": 2480 + }, + { + "epoch": 1.988, + "grad_norm": 11.934551215608066, + "learning_rate": 1.5419478527575068e-06, + "logits/chosen": -1.0436184406280518, + "logits/rejected": -0.9820472002029419, + "logps/chosen": -1.0538418292999268, + "logps/rejected": -1.5438861846923828, + "loss": 0.763, + "odds_ratio_loss": 0.44970980286598206, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.05269209295511246, + "rewards/margins": 0.024502214044332504, + "rewards/rejected": -0.07719431817531586, + "sft_loss": 1.0538418292999268, + "step": 2485 + }, + { + "epoch": 1.992, + "grad_norm": 4.730272496249411, + "learning_rate": 1.5312110338697427e-06, + "logits/chosen": -0.7060618996620178, + "logits/rejected": -1.1768749952316284, + "logps/chosen": -0.6467851400375366, + "logps/rejected": -1.8637425899505615, + "loss": 0.6756, + "odds_ratio_loss": 0.24719473719596863, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03233925253152847, + "rewards/margins": 0.06084787845611572, + "rewards/rejected": -0.09318713843822479, + "sft_loss": 0.6467851400375366, + "step": 2490 + }, + { + "epoch": 1.996, + "grad_norm": 5.17945710231851, + "learning_rate": 1.520495200543754e-06, + "logits/chosen": -0.6287657022476196, + "logits/rejected": -1.2824218273162842, + "logps/chosen": -0.8325425982475281, + "logps/rejected": -1.3392772674560547, + "loss": 0.7908, + "odds_ratio_loss": 0.5226107835769653, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.041627127677202225, + "rewards/margins": 0.02533673867583275, + "rewards/rejected": -0.06696386635303497, + "sft_loss": 0.8325425982475281, + "step": 2495 + }, + { + "epoch": 2.0, + "grad_norm": 7.185045771765787, + "learning_rate": 1.509800584902108e-06, + "logits/chosen": -0.7127388119697571, + "logits/rejected": -0.9128797650337219, + "logps/chosen": -0.9189082980155945, + "logps/rejected": -1.3660955429077148, + "loss": 0.7301, + "odds_ratio_loss": 0.41850653290748596, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.045945413410663605, + "rewards/margins": 0.022359367460012436, + "rewards/rejected": -0.06830477714538574, + "sft_loss": 0.9189082980155945, + "step": 2500 + }, + { + "epoch": 2.004, + "grad_norm": 5.779855004519885, + "learning_rate": 1.4991274186077632e-06, + "logits/chosen": -0.6605402827262878, + "logits/rejected": -0.8482527732849121, + "logps/chosen": -0.4051028788089752, + "logps/rejected": -1.4240385293960571, + "loss": 0.4714, + "odds_ratio_loss": 0.22203238308429718, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02025514282286167, + "rewards/margins": 0.05094677209854126, + "rewards/rejected": -0.07120192050933838, + "sft_loss": 0.4051028788089752, + "step": 2505 + }, + { + "epoch": 2.008, + "grad_norm": 5.911255709626764, + "learning_rate": 1.4884759328590476e-06, + "logits/chosen": -0.8963392376899719, + "logits/rejected": -1.2612377405166626, + "logps/chosen": -0.5547269582748413, + "logps/rejected": -1.2793185710906982, + "loss": 0.5443, + "odds_ratio_loss": 0.2325483113527298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027736347168684006, + "rewards/margins": 0.03622957691550255, + "rewards/rejected": -0.06396592408418655, + "sft_loss": 0.5547269582748413, + "step": 2510 + }, + { + "epoch": 2.012, + "grad_norm": 4.263744768867711, + "learning_rate": 1.4778463583846553e-06, + "logits/chosen": -0.6352235078811646, + "logits/rejected": -1.1501035690307617, + "logps/chosen": -0.6144863963127136, + "logps/rejected": -1.6748031377792358, + "loss": 0.5734, + "odds_ratio_loss": 0.24970802664756775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03072432242333889, + "rewards/margins": 0.05301583930850029, + "rewards/rejected": -0.08374015986919403, + "sft_loss": 0.6144863963127136, + "step": 2515 + }, + { + "epoch": 2.016, + "grad_norm": 4.168544749574423, + "learning_rate": 1.467238925438646e-06, + "logits/chosen": -0.6912413835525513, + "logits/rejected": -0.9819846153259277, + "logps/chosen": -0.5293835401535034, + "logps/rejected": -1.6525404453277588, + "loss": 0.5187, + "odds_ratio_loss": 0.2445772886276245, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02646917663514614, + "rewards/margins": 0.05615784972906113, + "rewards/rejected": -0.08262702077627182, + "sft_loss": 0.5293835401535034, + "step": 2520 + }, + { + "epoch": 2.02, + "grad_norm": 6.164215653537583, + "learning_rate": 1.4566538637954556e-06, + "logits/chosen": -1.3008034229278564, + "logits/rejected": -0.7624127864837646, + "logps/chosen": -0.40782418847084045, + "logps/rejected": -1.5719823837280273, + "loss": 0.5314, + "odds_ratio_loss": 0.13072045147418976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020391209051012993, + "rewards/margins": 0.058207906782627106, + "rewards/rejected": -0.07859911769628525, + "sft_loss": 0.40782418847084045, + "step": 2525 + }, + { + "epoch": 2.024, + "grad_norm": 4.4630247981790525, + "learning_rate": 1.446091402744923e-06, + "logits/chosen": -0.9515730738639832, + "logits/rejected": -1.204268217086792, + "logps/chosen": -0.748781144618988, + "logps/rejected": -1.6079699993133545, + "loss": 0.5591, + "odds_ratio_loss": 0.27362948656082153, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03743905574083328, + "rewards/margins": 0.04295944422483444, + "rewards/rejected": -0.08039849996566772, + "sft_loss": 0.748781144618988, + "step": 2530 + }, + { + "epoch": 2.028, + "grad_norm": 5.834985541054327, + "learning_rate": 1.4355517710873184e-06, + "logits/chosen": -0.761924147605896, + "logits/rejected": -1.0106163024902344, + "logps/chosen": -0.6143468618392944, + "logps/rejected": -1.4826791286468506, + "loss": 0.5242, + "odds_ratio_loss": 0.3327800929546356, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03071734681725502, + "rewards/margins": 0.043416619300842285, + "rewards/rejected": -0.074133962392807, + "sft_loss": 0.6143468618392944, + "step": 2535 + }, + { + "epoch": 2.032, + "grad_norm": 4.93330991376578, + "learning_rate": 1.4250351971283937e-06, + "logits/chosen": -0.7397106885910034, + "logits/rejected": -0.974991500377655, + "logps/chosen": -0.4890953600406647, + "logps/rejected": -1.5005598068237305, + "loss": 0.5051, + "odds_ratio_loss": 0.21572282910346985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024454768747091293, + "rewards/margins": 0.05057322978973389, + "rewards/rejected": -0.07502799481153488, + "sft_loss": 0.4890953600406647, + "step": 2540 + }, + { + "epoch": 2.036, + "grad_norm": 5.652910564521034, + "learning_rate": 1.41454190867443e-06, + "logits/chosen": -0.6541872024536133, + "logits/rejected": -1.2734535932540894, + "logps/chosen": -0.5036207437515259, + "logps/rejected": -1.882581353187561, + "loss": 0.4315, + "odds_ratio_loss": 0.25754693150520325, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.025181034579873085, + "rewards/margins": 0.06894804537296295, + "rewards/rejected": -0.09412907809019089, + "sft_loss": 0.5036207437515259, + "step": 2545 + }, + { + "epoch": 2.04, + "grad_norm": 5.023733060512668, + "learning_rate": 1.4040721330273063e-06, + "logits/chosen": -1.1533195972442627, + "logits/rejected": -0.9162420034408569, + "logps/chosen": -0.3805043697357178, + "logps/rejected": -1.3916971683502197, + "loss": 0.5004, + "odds_ratio_loss": 0.23528532683849335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01902521774172783, + "rewards/margins": 0.0505596399307251, + "rewards/rejected": -0.06958486139774323, + "sft_loss": 0.3805043697357178, + "step": 2550 + }, + { + "epoch": 2.044, + "grad_norm": 3.932026075807386, + "learning_rate": 1.3936260969795778e-06, + "logits/chosen": -0.8410050272941589, + "logits/rejected": -0.9105769991874695, + "logps/chosen": -0.6416537165641785, + "logps/rejected": -1.6031726598739624, + "loss": 0.5072, + "odds_ratio_loss": 0.26725801825523376, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.032082684338092804, + "rewards/margins": 0.04807594418525696, + "rewards/rejected": -0.08015862852334976, + "sft_loss": 0.6416537165641785, + "step": 2555 + }, + { + "epoch": 2.048, + "grad_norm": 6.870678133966881, + "learning_rate": 1.3832040268095589e-06, + "logits/chosen": -0.6729310750961304, + "logits/rejected": -0.8668330907821655, + "logps/chosen": -0.40369588136672974, + "logps/rejected": -1.3457889556884766, + "loss": 0.5096, + "odds_ratio_loss": 0.2014884054660797, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020184790715575218, + "rewards/margins": 0.04710465669631958, + "rewards/rejected": -0.06728944927453995, + "sft_loss": 0.40369588136672974, + "step": 2560 + }, + { + "epoch": 2.052, + "grad_norm": 7.587293716398404, + "learning_rate": 1.3728061482764238e-06, + "logits/chosen": -0.843582808971405, + "logits/rejected": -0.9166304469108582, + "logps/chosen": -0.604731559753418, + "logps/rejected": -1.5095970630645752, + "loss": 0.5172, + "odds_ratio_loss": 0.2633668780326843, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03023657761514187, + "rewards/margins": 0.0452432855963707, + "rewards/rejected": -0.07547986507415771, + "sft_loss": 0.604731559753418, + "step": 2565 + }, + { + "epoch": 2.056, + "grad_norm": 4.0093961868186625, + "learning_rate": 1.362432686615316e-06, + "logits/chosen": -0.9857913851737976, + "logits/rejected": -0.7414765357971191, + "logps/chosen": -0.46555963158607483, + "logps/rejected": -1.3948227167129517, + "loss": 0.4983, + "odds_ratio_loss": 0.2733033299446106, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.023277979344129562, + "rewards/margins": 0.046463146805763245, + "rewards/rejected": -0.0697411373257637, + "sft_loss": 0.46555963158607483, + "step": 2570 + }, + { + "epoch": 2.06, + "grad_norm": 6.939466081035446, + "learning_rate": 1.3520838665324704e-06, + "logits/chosen": -1.1071815490722656, + "logits/rejected": -0.817790687084198, + "logps/chosen": -0.4723474085330963, + "logps/rejected": -1.3803852796554565, + "loss": 0.5202, + "odds_ratio_loss": 0.256916880607605, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.023617371916770935, + "rewards/margins": 0.04540189355611801, + "rewards/rejected": -0.06901925802230835, + "sft_loss": 0.4723474085330963, + "step": 2575 + }, + { + "epoch": 2.064, + "grad_norm": 4.140170449016894, + "learning_rate": 1.3417599122003464e-06, + "logits/chosen": -0.450253427028656, + "logits/rejected": -0.9482170343399048, + "logps/chosen": -0.6843463778495789, + "logps/rejected": -1.6336677074432373, + "loss": 0.5651, + "odds_ratio_loss": 0.27679651975631714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03421732038259506, + "rewards/margins": 0.047466062009334564, + "rewards/rejected": -0.08168338239192963, + "sft_loss": 0.6843463778495789, + "step": 2580 + }, + { + "epoch": 2.068, + "grad_norm": 5.1100760056211545, + "learning_rate": 1.3314610472527645e-06, + "logits/chosen": -0.8113574981689453, + "logits/rejected": -1.0102758407592773, + "logps/chosen": -0.43412670493125916, + "logps/rejected": -1.9312794208526611, + "loss": 0.4464, + "odds_ratio_loss": 0.12802623212337494, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02170633338391781, + "rewards/margins": 0.07485763728618622, + "rewards/rejected": -0.09656397253274918, + "sft_loss": 0.43412670493125916, + "step": 2585 + }, + { + "epoch": 2.072, + "grad_norm": 7.56195378391692, + "learning_rate": 1.3211874947800747e-06, + "logits/chosen": -0.7834498286247253, + "logits/rejected": -0.8870722055435181, + "logps/chosen": -0.2993749678134918, + "logps/rejected": -1.3909119367599487, + "loss": 0.4961, + "odds_ratio_loss": 0.1264984905719757, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014968748204410076, + "rewards/margins": 0.054576851427555084, + "rewards/rejected": -0.06954559683799744, + "sft_loss": 0.2993749678134918, + "step": 2590 + }, + { + "epoch": 2.076, + "grad_norm": 5.796090053121948, + "learning_rate": 1.3109394773243117e-06, + "logits/chosen": -0.9734644889831543, + "logits/rejected": -1.1090242862701416, + "logps/chosen": -0.729141891002655, + "logps/rejected": -1.5011317729949951, + "loss": 0.5283, + "odds_ratio_loss": 0.26416879892349243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03645709902048111, + "rewards/margins": 0.038599494844675064, + "rewards/rejected": -0.07505659013986588, + "sft_loss": 0.729141891002655, + "step": 2595 + }, + { + "epoch": 2.08, + "grad_norm": 4.905492874489777, + "learning_rate": 1.3007172168743854e-06, + "logits/chosen": -0.6443847417831421, + "logits/rejected": -0.6509965658187866, + "logps/chosen": -0.4077952802181244, + "logps/rejected": -1.5444676876068115, + "loss": 0.4459, + "odds_ratio_loss": 0.1849530041217804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02038976177573204, + "rewards/margins": 0.056833624839782715, + "rewards/rejected": -0.07722338289022446, + "sft_loss": 0.4077952802181244, + "step": 2600 + }, + { + "epoch": 2.084, + "grad_norm": 8.38385935812006, + "learning_rate": 1.2905209348612596e-06, + "logits/chosen": -1.0582338571548462, + "logits/rejected": -1.0338222980499268, + "logps/chosen": -0.3466675281524658, + "logps/rejected": -1.3382198810577393, + "loss": 0.4997, + "odds_ratio_loss": 0.17245514690876007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01733337715268135, + "rewards/margins": 0.049577612429857254, + "rewards/rejected": -0.0669109970331192, + "sft_loss": 0.3466675281524658, + "step": 2605 + }, + { + "epoch": 2.088, + "grad_norm": 4.39436373774201, + "learning_rate": 1.280350852153168e-06, + "logits/chosen": -0.9197877049446106, + "logits/rejected": -1.4192711114883423, + "logps/chosen": -0.5219482183456421, + "logps/rejected": -2.023850679397583, + "loss": 0.4535, + "odds_ratio_loss": 0.1568504273891449, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026097416877746582, + "rewards/margins": 0.07509511709213257, + "rewards/rejected": -0.10119253396987915, + "sft_loss": 0.5219482183456421, + "step": 2610 + }, + { + "epoch": 2.092, + "grad_norm": 31.805321381556926, + "learning_rate": 1.2702071890508235e-06, + "logits/chosen": -0.8824436068534851, + "logits/rejected": -1.0908191204071045, + "logps/chosen": -0.49740344285964966, + "logps/rejected": -1.838228464126587, + "loss": 0.5843, + "odds_ratio_loss": 0.20825867354869843, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.024870174005627632, + "rewards/margins": 0.06704124808311462, + "rewards/rejected": -0.0919114202260971, + "sft_loss": 0.49740344285964966, + "step": 2615 + }, + { + "epoch": 2.096, + "grad_norm": 5.572501524698811, + "learning_rate": 1.260090165282645e-06, + "logits/chosen": -1.0808252096176147, + "logits/rejected": -0.8268556594848633, + "logps/chosen": -0.5673039555549622, + "logps/rejected": -1.536179780960083, + "loss": 0.4523, + "odds_ratio_loss": 0.24021944403648376, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.028365198522806168, + "rewards/margins": 0.048443786799907684, + "rewards/rejected": -0.07680898904800415, + "sft_loss": 0.5673039555549622, + "step": 2620 + }, + { + "epoch": 2.1, + "grad_norm": 5.981975287930625, + "learning_rate": 1.2500000000000007e-06, + "logits/chosen": -1.054109811782837, + "logits/rejected": -1.0586509704589844, + "logps/chosen": -0.7665703296661377, + "logps/rejected": -1.0670462846755981, + "loss": 0.5098, + "odds_ratio_loss": 0.5087045431137085, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.038328517228364944, + "rewards/margins": 0.015023794956505299, + "rewards/rejected": -0.05335230752825737, + "sft_loss": 0.7665703296661377, + "step": 2625 + }, + { + "epoch": 2.104, + "grad_norm": 7.544463465155528, + "learning_rate": 1.2399369117724582e-06, + "logits/chosen": -1.0681202411651611, + "logits/rejected": -1.0832048654556274, + "logps/chosen": -0.5446698069572449, + "logps/rejected": -1.5458736419677734, + "loss": 0.4264, + "odds_ratio_loss": 0.2728939950466156, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.027233490720391273, + "rewards/margins": 0.05006019398570061, + "rewards/rejected": -0.07729368656873703, + "sft_loss": 0.5446698069572449, + "step": 2630 + }, + { + "epoch": 2.108, + "grad_norm": 4.9815831235811325, + "learning_rate": 1.2299011185830557e-06, + "logits/chosen": -1.1409013271331787, + "logits/rejected": -0.8681640625, + "logps/chosen": -0.3343183994293213, + "logps/rejected": -1.3352359533309937, + "loss": 0.5583, + "odds_ratio_loss": 0.17289471626281738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016715919598937035, + "rewards/margins": 0.05004587024450302, + "rewards/rejected": -0.0667618066072464, + "sft_loss": 0.3343183994293213, + "step": 2635 + }, + { + "epoch": 2.112, + "grad_norm": 6.73634607533909, + "learning_rate": 1.2198928378235717e-06, + "logits/chosen": -1.1178267002105713, + "logits/rejected": -0.7387592196464539, + "logps/chosen": -0.44447723031044006, + "logps/rejected": -1.3598803281784058, + "loss": 0.5268, + "odds_ratio_loss": 0.21014420688152313, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.022223861888051033, + "rewards/margins": 0.04577016085386276, + "rewards/rejected": -0.06799402087926865, + "sft_loss": 0.44447723031044006, + "step": 2640 + }, + { + "epoch": 2.116, + "grad_norm": 4.379209925386878, + "learning_rate": 1.2099122862898214e-06, + "logits/chosen": -0.9532085657119751, + "logits/rejected": -1.0121229887008667, + "logps/chosen": -0.4848550856113434, + "logps/rejected": -1.2674915790557861, + "loss": 0.4112, + "odds_ratio_loss": 0.23585955798625946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02424275316298008, + "rewards/margins": 0.039131827652454376, + "rewards/rejected": -0.0633745864033699, + "sft_loss": 0.4848550856113434, + "step": 2645 + }, + { + "epoch": 2.12, + "grad_norm": 5.5875611806210825, + "learning_rate": 1.1999596801769617e-06, + "logits/chosen": -0.9600692987442017, + "logits/rejected": -0.9517749547958374, + "logps/chosen": -0.24040523171424866, + "logps/rejected": -1.3553459644317627, + "loss": 0.5047, + "odds_ratio_loss": 0.10732688009738922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012020261958241463, + "rewards/margins": 0.05574704334139824, + "rewards/rejected": -0.06776730716228485, + "sft_loss": 0.24040523171424866, + "step": 2650 + }, + { + "epoch": 2.124, + "grad_norm": 5.665887523645101, + "learning_rate": 1.1900352350748026e-06, + "logits/chosen": -1.3023062944412231, + "logits/rejected": -0.6389235258102417, + "logps/chosen": -0.3508966863155365, + "logps/rejected": -1.516701102256775, + "loss": 0.4761, + "odds_ratio_loss": 0.20118489861488342, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.017544833943247795, + "rewards/margins": 0.05829022452235222, + "rewards/rejected": -0.07583504915237427, + "sft_loss": 0.3508966863155365, + "step": 2655 + }, + { + "epoch": 2.128, + "grad_norm": 4.849761975833439, + "learning_rate": 1.1801391659631423e-06, + "logits/chosen": -0.7300786375999451, + "logits/rejected": -1.2495789527893066, + "logps/chosen": -0.7808480262756348, + "logps/rejected": -1.481135606765747, + "loss": 0.5719, + "odds_ratio_loss": 0.3138294816017151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0390424020588398, + "rewards/margins": 0.035014379769563675, + "rewards/rejected": -0.07405678182840347, + "sft_loss": 0.7808480262756348, + "step": 2660 + }, + { + "epoch": 2.132, + "grad_norm": 7.470137375834024, + "learning_rate": 1.170271687207106e-06, + "logits/chosen": -0.8559409379959106, + "logits/rejected": -0.9146011471748352, + "logps/chosen": -0.42539578676223755, + "logps/rejected": -1.569939374923706, + "loss": 0.505, + "odds_ratio_loss": 0.17492997646331787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0212697871029377, + "rewards/margins": 0.05722718685865402, + "rewards/rejected": -0.07849697768688202, + "sft_loss": 0.42539578676223755, + "step": 2665 + }, + { + "epoch": 2.136, + "grad_norm": 4.739681467147201, + "learning_rate": 1.160433012552508e-06, + "logits/chosen": -0.571463942527771, + "logits/rejected": -1.121757984161377, + "logps/chosen": -0.5185319185256958, + "logps/rejected": -1.5073848962783813, + "loss": 0.5349, + "odds_ratio_loss": 0.20745344460010529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025926601141691208, + "rewards/margins": 0.04944263771176338, + "rewards/rejected": -0.07536924630403519, + "sft_loss": 0.5185319185256958, + "step": 2670 + }, + { + "epoch": 2.14, + "grad_norm": 4.496694368173494, + "learning_rate": 1.1506233551212186e-06, + "logits/chosen": -0.7737411856651306, + "logits/rejected": -1.0046793222427368, + "logps/chosen": -0.5965171456336975, + "logps/rejected": -1.5169528722763062, + "loss": 0.5752, + "odds_ratio_loss": 0.23371455073356628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029825860634446144, + "rewards/margins": 0.04602178931236267, + "rewards/rejected": -0.07584764063358307, + "sft_loss": 0.5965171456336975, + "step": 2675 + }, + { + "epoch": 2.144, + "grad_norm": 5.818039980551584, + "learning_rate": 1.1408429274065418e-06, + "logits/chosen": -1.1345255374908447, + "logits/rejected": -1.221318244934082, + "logps/chosen": -0.3457716703414917, + "logps/rejected": -1.645448088645935, + "loss": 0.3843, + "odds_ratio_loss": 0.146096870303154, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.017288584262132645, + "rewards/margins": 0.06498382240533829, + "rewards/rejected": -0.08227241039276123, + "sft_loss": 0.3457716703414917, + "step": 2680 + }, + { + "epoch": 2.148, + "grad_norm": 7.617489619432182, + "learning_rate": 1.1310919412686248e-06, + "logits/chosen": -0.6363608837127686, + "logits/rejected": -0.9788458943367004, + "logps/chosen": -0.31420546770095825, + "logps/rejected": -2.0080409049987793, + "loss": 0.4396, + "odds_ratio_loss": 0.08285339921712875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015710271894931793, + "rewards/margins": 0.08469177782535553, + "rewards/rejected": -0.10040205717086792, + "sft_loss": 0.31420546770095825, + "step": 2685 + }, + { + "epoch": 2.152, + "grad_norm": 5.306338004940538, + "learning_rate": 1.1213706079298566e-06, + "logits/chosen": -0.6837140917778015, + "logits/rejected": -1.3568503856658936, + "logps/chosen": -0.4734787046909332, + "logps/rejected": -1.4761061668395996, + "loss": 0.4861, + "odds_ratio_loss": 0.20139172673225403, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02367393672466278, + "rewards/margins": 0.050131380558013916, + "rewards/rejected": -0.0738053172826767, + "sft_loss": 0.4734787046909332, + "step": 2690 + }, + { + "epoch": 2.156, + "grad_norm": 5.238611883596988, + "learning_rate": 1.1116791379703032e-06, + "logits/chosen": -0.6437733769416809, + "logits/rejected": -0.8825413584709167, + "logps/chosen": -0.3527492880821228, + "logps/rejected": -2.517010450363159, + "loss": 0.4288, + "odds_ratio_loss": 0.10402262210845947, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0176374651491642, + "rewards/margins": 0.10821305215358734, + "rewards/rejected": -0.12585052847862244, + "sft_loss": 0.3527492880821228, + "step": 2695 + }, + { + "epoch": 2.16, + "grad_norm": 5.037423172133621, + "learning_rate": 1.1020177413231334e-06, + "logits/chosen": -0.7074111700057983, + "logits/rejected": -1.1180449724197388, + "logps/chosen": -0.706173300743103, + "logps/rejected": -1.7855567932128906, + "loss": 0.5407, + "odds_ratio_loss": 0.2965608835220337, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03530866652727127, + "rewards/margins": 0.05396916717290878, + "rewards/rejected": -0.08927784115076065, + "sft_loss": 0.706173300743103, + "step": 2700 + }, + { + "epoch": 2.164, + "grad_norm": 11.625726843094434, + "learning_rate": 1.0923866272700845e-06, + "logits/chosen": -0.6679056286811829, + "logits/rejected": -0.8790310025215149, + "logps/chosen": -0.37520498037338257, + "logps/rejected": -1.725489854812622, + "loss": 0.4112, + "odds_ratio_loss": 0.1370328813791275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018760250881314278, + "rewards/margins": 0.06751424074172974, + "rewards/rejected": -0.08627448976039886, + "sft_loss": 0.37520498037338257, + "step": 2705 + }, + { + "epoch": 2.168, + "grad_norm": 5.751387431303456, + "learning_rate": 1.0827860044369226e-06, + "logits/chosen": -1.1543610095977783, + "logits/rejected": -1.1389491558074951, + "logps/chosen": -0.8079617619514465, + "logps/rejected": -1.881744384765625, + "loss": 0.653, + "odds_ratio_loss": 0.21621274948120117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04039808362722397, + "rewards/margins": 0.0536891333758831, + "rewards/rejected": -0.09408722817897797, + "sft_loss": 0.8079617619514465, + "step": 2710 + }, + { + "epoch": 2.172, + "grad_norm": 13.56821195989023, + "learning_rate": 1.073216080788921e-06, + "logits/chosen": -0.7350510358810425, + "logits/rejected": -0.9062551259994507, + "logps/chosen": -0.41547298431396484, + "logps/rejected": -1.4787991046905518, + "loss": 0.4839, + "odds_ratio_loss": 0.17242933809757233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02077365107834339, + "rewards/margins": 0.053166307508945465, + "rewards/rejected": -0.07393995672464371, + "sft_loss": 0.41547298431396484, + "step": 2715 + }, + { + "epoch": 2.176, + "grad_norm": 4.723019200073755, + "learning_rate": 1.06367706362636e-06, + "logits/chosen": -1.2677191495895386, + "logits/rejected": -1.3101189136505127, + "logps/chosen": -0.5442667603492737, + "logps/rejected": -1.3093074560165405, + "loss": 0.4889, + "odds_ratio_loss": 0.2581162452697754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027213340625166893, + "rewards/margins": 0.03825204446911812, + "rewards/rejected": -0.06546537578105927, + "sft_loss": 0.5442667603492737, + "step": 2720 + }, + { + "epoch": 2.18, + "grad_norm": 6.320991135595779, + "learning_rate": 1.0541691595800338e-06, + "logits/chosen": -0.7713707089424133, + "logits/rejected": -0.8052916526794434, + "logps/chosen": -0.8031466603279114, + "logps/rejected": -1.449881911277771, + "loss": 0.4879, + "odds_ratio_loss": 0.34610167145729065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04015733301639557, + "rewards/margins": 0.0323367640376091, + "rewards/rejected": -0.07249408960342407, + "sft_loss": 0.8031466603279114, + "step": 2725 + }, + { + "epoch": 2.184, + "grad_norm": 7.667541694530943, + "learning_rate": 1.0446925746067768e-06, + "logits/chosen": -0.735578179359436, + "logits/rejected": -1.290317416191101, + "logps/chosen": -0.4031465947628021, + "logps/rejected": -2.864250898361206, + "loss": 0.4889, + "odds_ratio_loss": 0.11299661546945572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020157333463430405, + "rewards/margins": 0.12305520474910736, + "rewards/rejected": -0.14321252703666687, + "sft_loss": 0.4031465947628021, + "step": 2730 + }, + { + "epoch": 2.188, + "grad_norm": 8.598577539363314, + "learning_rate": 1.0352475139849993e-06, + "logits/chosen": -1.1117980480194092, + "logits/rejected": -1.2077891826629639, + "logps/chosen": -0.6618624329566956, + "logps/rejected": -1.303166389465332, + "loss": 0.5357, + "odds_ratio_loss": 0.4117598533630371, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03309311717748642, + "rewards/margins": 0.032065197825431824, + "rewards/rejected": -0.06515831500291824, + "sft_loss": 0.6618624329566956, + "step": 2735 + }, + { + "epoch": 2.192, + "grad_norm": 5.479651626100435, + "learning_rate": 1.0258341823102418e-06, + "logits/chosen": -0.873387336730957, + "logits/rejected": -1.0599385499954224, + "logps/chosen": -0.8911870121955872, + "logps/rejected": -1.9314569234848022, + "loss": 0.5608, + "odds_ratio_loss": 0.3484743535518646, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04455935209989548, + "rewards/margins": 0.052013497799634933, + "rewards/rejected": -0.09657285362482071, + "sft_loss": 0.8911870121955872, + "step": 2740 + }, + { + "epoch": 2.196, + "grad_norm": 6.711233602443305, + "learning_rate": 1.0164527834907468e-06, + "logits/chosen": -1.1073827743530273, + "logits/rejected": -0.8712652325630188, + "logps/chosen": -0.3501274287700653, + "logps/rejected": -1.0953724384307861, + "loss": 0.463, + "odds_ratio_loss": 0.2384282350540161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017506374046206474, + "rewards/margins": 0.03726224601268768, + "rewards/rejected": -0.05476861447095871, + "sft_loss": 0.3501274287700653, + "step": 2745 + }, + { + "epoch": 2.2, + "grad_norm": 6.323686554397781, + "learning_rate": 1.0071035207430352e-06, + "logits/chosen": -0.5682858228683472, + "logits/rejected": -1.2733685970306396, + "logps/chosen": -0.5779908895492554, + "logps/rejected": -1.2694551944732666, + "loss": 0.4752, + "odds_ratio_loss": 0.37383320927619934, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0288995448499918, + "rewards/margins": 0.03457321599125862, + "rewards/rejected": -0.06347276270389557, + "sft_loss": 0.5779908895492554, + "step": 2750 + }, + { + "epoch": 2.204, + "grad_norm": 5.639287689051844, + "learning_rate": 9.977865965875091e-07, + "logits/chosen": -0.7087677717208862, + "logits/rejected": -0.8449762463569641, + "logps/chosen": -0.5380462408065796, + "logps/rejected": -1.518744707107544, + "loss": 0.508, + "odds_ratio_loss": 0.23005056381225586, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02690231241285801, + "rewards/margins": 0.049034927040338516, + "rewards/rejected": -0.07593724131584167, + "sft_loss": 0.5380462408065796, + "step": 2755 + }, + { + "epoch": 2.208, + "grad_norm": 6.305391688848771, + "learning_rate": 9.88502212844063e-07, + "logits/chosen": -0.7394359111785889, + "logits/rejected": -1.2294960021972656, + "logps/chosen": -0.6039851307868958, + "logps/rejected": -1.388856291770935, + "loss": 0.5097, + "odds_ratio_loss": 0.2779824733734131, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.030199263244867325, + "rewards/margins": 0.039243556559085846, + "rewards/rejected": -0.06944280862808228, + "sft_loss": 0.6039851307868958, + "step": 2760 + }, + { + "epoch": 2.212, + "grad_norm": 5.741952540098281, + "learning_rate": 9.792505706277136e-07, + "logits/chosen": -0.8889573216438293, + "logits/rejected": -0.9258907437324524, + "logps/chosen": -0.5408663749694824, + "logps/rejected": -1.488094449043274, + "loss": 0.5131, + "odds_ratio_loss": 0.21949462592601776, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02704331837594509, + "rewards/margins": 0.047361403703689575, + "rewards/rejected": -0.07440472394227982, + "sft_loss": 0.5408663749694824, + "step": 2765 + }, + { + "epoch": 2.216, + "grad_norm": 11.523508801713763, + "learning_rate": 9.700318703442437e-07, + "logits/chosen": -1.087032437324524, + "logits/rejected": -0.9947022199630737, + "logps/chosen": -0.47177377343177795, + "logps/rejected": -1.4707008600234985, + "loss": 0.4983, + "odds_ratio_loss": 0.2322104275226593, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02358868531882763, + "rewards/margins": 0.04994635283946991, + "rewards/rejected": -0.07353504002094269, + "sft_loss": 0.47177377343177795, + "step": 2770 + }, + { + "epoch": 2.22, + "grad_norm": 4.462198076439179, + "learning_rate": 9.608463116858544e-07, + "logits/chosen": -0.736533522605896, + "logits/rejected": -0.829948902130127, + "logps/chosen": -0.6742941737174988, + "logps/rejected": -1.4203133583068848, + "loss": 0.5412, + "odds_ratio_loss": 0.3509899079799652, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03371470421552658, + "rewards/margins": 0.037300966680049896, + "rewards/rejected": -0.07101567089557648, + "sft_loss": 0.6742941737174988, + "step": 2775 + }, + { + "epoch": 2.224, + "grad_norm": 4.228429825727303, + "learning_rate": 9.516940936268504e-07, + "logits/chosen": -1.1075184345245361, + "logits/rejected": -1.0531575679779053, + "logps/chosen": -0.21282517910003662, + "logps/rejected": -1.0983449220657349, + "loss": 0.4243, + "odds_ratio_loss": 0.24119925498962402, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.010641258209943771, + "rewards/margins": 0.04427598416805267, + "rewards/rejected": -0.05491724610328674, + "sft_loss": 0.21282517910003662, + "step": 2780 + }, + { + "epoch": 2.228, + "grad_norm": 5.45480141160989, + "learning_rate": 9.4257541441932e-07, + "logits/chosen": -0.9974383115768433, + "logits/rejected": -1.3215069770812988, + "logps/chosen": -0.42205625772476196, + "logps/rejected": -1.7309017181396484, + "loss": 0.3804, + "odds_ratio_loss": 0.16683904826641083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02110280841588974, + "rewards/margins": 0.0654422789812088, + "rewards/rejected": -0.08654508739709854, + "sft_loss": 0.42205625772476196, + "step": 2785 + }, + { + "epoch": 2.232, + "grad_norm": 9.178748455728739, + "learning_rate": 9.334904715888496e-07, + "logits/chosen": -1.0844497680664062, + "logits/rejected": -1.070488691329956, + "logps/chosen": -0.32815462350845337, + "logps/rejected": -1.7352529764175415, + "loss": 0.4216, + "odds_ratio_loss": 0.09935733675956726, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016407731920480728, + "rewards/margins": 0.07035491615533829, + "rewards/rejected": -0.08676265180110931, + "sft_loss": 0.32815462350845337, + "step": 2790 + }, + { + "epoch": 2.2359999999999998, + "grad_norm": 4.558311798079199, + "learning_rate": 9.244394619302338e-07, + "logits/chosen": -0.9466593861579895, + "logits/rejected": -0.8899604678153992, + "logps/chosen": -0.7910462617874146, + "logps/rejected": -1.691467523574829, + "loss": 0.5247, + "odds_ratio_loss": 0.33903276920318604, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03955231234431267, + "rewards/margins": 0.04502106085419655, + "rewards/rejected": -0.08457337319850922, + "sft_loss": 0.7910462617874146, + "step": 2795 + }, + { + "epoch": 2.24, + "grad_norm": 5.935563711192816, + "learning_rate": 9.154225815032242e-07, + "logits/chosen": -0.7447048425674438, + "logits/rejected": -1.309199333190918, + "logps/chosen": -0.4879288673400879, + "logps/rejected": -1.4591110944747925, + "loss": 0.5058, + "odds_ratio_loss": 0.22252912819385529, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.024396440014243126, + "rewards/margins": 0.04855911061167717, + "rewards/rejected": -0.07295555621385574, + "sft_loss": 0.4879288673400879, + "step": 2800 + }, + { + "epoch": 2.2439999999999998, + "grad_norm": 5.083831381993147, + "learning_rate": 9.064400256282757e-07, + "logits/chosen": -0.4584059715270996, + "logits/rejected": -0.9872250556945801, + "logps/chosen": -0.5474129915237427, + "logps/rejected": -1.5436441898345947, + "loss": 0.4659, + "odds_ratio_loss": 0.20902732014656067, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.027370650321245193, + "rewards/margins": 0.049811553210020065, + "rewards/rejected": -0.07718220353126526, + "sft_loss": 0.5474129915237427, + "step": 2805 + }, + { + "epoch": 2.248, + "grad_norm": 6.37725598665606, + "learning_rate": 8.974919888823164e-07, + "logits/chosen": -1.1903154850006104, + "logits/rejected": -0.9519041180610657, + "logps/chosen": -0.407247930765152, + "logps/rejected": -1.9125579595565796, + "loss": 0.4918, + "odds_ratio_loss": 0.17733065783977509, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02036239765584469, + "rewards/margins": 0.07526550441980362, + "rewards/rejected": -0.09562790393829346, + "sft_loss": 0.407247930765152, + "step": 2810 + }, + { + "epoch": 2.252, + "grad_norm": 23.791980041949664, + "learning_rate": 8.885786650945333e-07, + "logits/chosen": -0.9534207582473755, + "logits/rejected": -0.8568700551986694, + "logps/chosen": -0.4399814009666443, + "logps/rejected": -1.6407368183135986, + "loss": 0.4943, + "odds_ratio_loss": 0.2155378758907318, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.021999072283506393, + "rewards/margins": 0.0600377693772316, + "rewards/rejected": -0.08203683793544769, + "sft_loss": 0.4399814009666443, + "step": 2815 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 9.509659444717578, + "learning_rate": 8.797002473421729e-07, + "logits/chosen": -0.47383326292037964, + "logits/rejected": -0.8560094833374023, + "logps/chosen": -0.532326340675354, + "logps/rejected": -1.574268102645874, + "loss": 0.4286, + "odds_ratio_loss": 0.2277209758758545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02661631628870964, + "rewards/margins": 0.05209709331393242, + "rewards/rejected": -0.07871340215206146, + "sft_loss": 0.532326340675354, + "step": 2820 + }, + { + "epoch": 2.26, + "grad_norm": 5.1984524732684765, + "learning_rate": 8.708569279463622e-07, + "logits/chosen": -0.8231816291809082, + "logits/rejected": -1.3245340585708618, + "logps/chosen": -0.7534009218215942, + "logps/rejected": -1.5175855159759521, + "loss": 0.4659, + "odds_ratio_loss": 0.22750845551490784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03767004236578941, + "rewards/margins": 0.03820923715829849, + "rewards/rejected": -0.0758792832493782, + "sft_loss": 0.7534009218215942, + "step": 2825 + }, + { + "epoch": 2.2640000000000002, + "grad_norm": 5.302329927455291, + "learning_rate": 8.620488984679378e-07, + "logits/chosen": -1.174373984336853, + "logits/rejected": -0.9231562614440918, + "logps/chosen": -0.49920812249183655, + "logps/rejected": -1.5488044023513794, + "loss": 0.4205, + "odds_ratio_loss": 0.1963232159614563, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024960406124591827, + "rewards/margins": 0.0524798147380352, + "rewards/rejected": -0.07744021713733673, + "sft_loss": 0.49920812249183655, + "step": 2830 + }, + { + "epoch": 2.268, + "grad_norm": 8.784170043250743, + "learning_rate": 8.532763497032987e-07, + "logits/chosen": -0.7181037068367004, + "logits/rejected": -1.2467901706695557, + "logps/chosen": -0.5864667892456055, + "logps/rejected": -1.448925256729126, + "loss": 0.5551, + "odds_ratio_loss": 0.26310136914253235, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.029323343187570572, + "rewards/margins": 0.043122924864292145, + "rewards/rejected": -0.07244626432657242, + "sft_loss": 0.5864667892456055, + "step": 2835 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 7.046247884710958, + "learning_rate": 8.445394716802754e-07, + "logits/chosen": -0.8855420351028442, + "logits/rejected": -1.3163471221923828, + "logps/chosen": -0.4142325818538666, + "logps/rejected": -1.6013071537017822, + "loss": 0.468, + "odds_ratio_loss": 0.14958259463310242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020711630582809448, + "rewards/margins": 0.05935372784733772, + "rewards/rejected": -0.08006535470485687, + "sft_loss": 0.4142325818538666, + "step": 2840 + }, + { + "epoch": 2.276, + "grad_norm": 5.056482274476605, + "learning_rate": 8.35838453654009e-07, + "logits/chosen": -0.7100471258163452, + "logits/rejected": -0.6780123114585876, + "logps/chosen": -0.6010003685951233, + "logps/rejected": -1.8489612340927124, + "loss": 0.4035, + "odds_ratio_loss": 0.2834250032901764, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.030050018802285194, + "rewards/margins": 0.0623980388045311, + "rewards/rejected": -0.09244807064533234, + "sft_loss": 0.6010003685951233, + "step": 2845 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 5.899418760006874, + "learning_rate": 8.271734841028553e-07, + "logits/chosen": -0.8226686716079712, + "logits/rejected": -1.2473328113555908, + "logps/chosen": -0.7500003576278687, + "logps/rejected": -1.7512264251708984, + "loss": 0.4737, + "odds_ratio_loss": 0.24083253741264343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03750002384185791, + "rewards/margins": 0.05006130784749985, + "rewards/rejected": -0.08756133168935776, + "sft_loss": 0.7500003576278687, + "step": 2850 + }, + { + "epoch": 2.284, + "grad_norm": 6.588276147321622, + "learning_rate": 8.185447507243e-07, + "logits/chosen": -0.8183411359786987, + "logits/rejected": -0.9986063838005066, + "logps/chosen": -0.5513178110122681, + "logps/rejected": -1.8742460012435913, + "loss": 0.5713, + "odds_ratio_loss": 0.18965277075767517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027565892785787582, + "rewards/margins": 0.06614641845226288, + "rewards/rejected": -0.09371231496334076, + "sft_loss": 0.5513178110122681, + "step": 2855 + }, + { + "epoch": 2.288, + "grad_norm": 4.435394123744806, + "learning_rate": 8.099524404308948e-07, + "logits/chosen": -0.8991169929504395, + "logits/rejected": -0.829195499420166, + "logps/chosen": -0.4634367525577545, + "logps/rejected": -1.410571813583374, + "loss": 0.4853, + "odds_ratio_loss": 0.23965725302696228, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.023171838372945786, + "rewards/margins": 0.047356750816106796, + "rewards/rejected": -0.07052858918905258, + "sft_loss": 0.4634367525577545, + "step": 2860 + }, + { + "epoch": 2.292, + "grad_norm": 5.612134177673955, + "learning_rate": 8.013967393462094e-07, + "logits/chosen": -0.6299402713775635, + "logits/rejected": -0.7464720010757446, + "logps/chosen": -0.24951812624931335, + "logps/rejected": -1.7939624786376953, + "loss": 0.4291, + "odds_ratio_loss": 0.07172398269176483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012475905939936638, + "rewards/margins": 0.07722222059965134, + "rewards/rejected": -0.08969812840223312, + "sft_loss": 0.24951812624931335, + "step": 2865 + }, + { + "epoch": 2.296, + "grad_norm": 5.034253606387231, + "learning_rate": 7.928778328007918e-07, + "logits/chosen": -0.8041081428527832, + "logits/rejected": -0.8288570642471313, + "logps/chosen": -0.4491206109523773, + "logps/rejected": -1.529018521308899, + "loss": 0.4568, + "odds_ratio_loss": 0.1833478957414627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022456031292676926, + "rewards/margins": 0.05399489402770996, + "rewards/rejected": -0.07645092159509659, + "sft_loss": 0.4491206109523773, + "step": 2870 + }, + { + "epoch": 2.3, + "grad_norm": 4.4545637866951004, + "learning_rate": 7.843959053281663e-07, + "logits/chosen": -0.6113277673721313, + "logits/rejected": -1.2742953300476074, + "logps/chosen": -0.3947676420211792, + "logps/rejected": -1.7495063543319702, + "loss": 0.4967, + "odds_ratio_loss": 0.12505455315113068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01973838172852993, + "rewards/margins": 0.06773693859577179, + "rewards/rejected": -0.08747532218694687, + "sft_loss": 0.3947676420211792, + "step": 2875 + }, + { + "epoch": 2.304, + "grad_norm": 4.327743928984781, + "learning_rate": 7.759511406608255e-07, + "logits/chosen": -0.8660491108894348, + "logits/rejected": -0.9698010683059692, + "logps/chosen": -0.4570358395576477, + "logps/rejected": -1.4516005516052246, + "loss": 0.4305, + "odds_ratio_loss": 0.17015303671360016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022851794958114624, + "rewards/margins": 0.049728237092494965, + "rewards/rejected": -0.07258002460002899, + "sft_loss": 0.4570358395576477, + "step": 2880 + }, + { + "epoch": 2.308, + "grad_norm": 4.831001325484035, + "learning_rate": 7.675437217262571e-07, + "logits/chosen": -0.9820590019226074, + "logits/rejected": -1.3118226528167725, + "logps/chosen": -0.6205871105194092, + "logps/rejected": -1.3952069282531738, + "loss": 0.5001, + "odds_ratio_loss": 0.38392138481140137, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0310293547809124, + "rewards/margins": 0.038730986416339874, + "rewards/rejected": -0.06976033747196198, + "sft_loss": 0.6205871105194092, + "step": 2885 + }, + { + "epoch": 2.312, + "grad_norm": 12.399106444555985, + "learning_rate": 7.591738306429769e-07, + "logits/chosen": -0.8063400387763977, + "logits/rejected": -0.97333824634552, + "logps/chosen": -0.38419461250305176, + "logps/rejected": -1.590563178062439, + "loss": 0.5036, + "odds_ratio_loss": 0.1262492537498474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019209731370210648, + "rewards/margins": 0.06031842902302742, + "rewards/rejected": -0.07952816039323807, + "sft_loss": 0.38419461250305176, + "step": 2890 + }, + { + "epoch": 2.316, + "grad_norm": 5.421465099897958, + "learning_rate": 7.508416487165862e-07, + "logits/chosen": -0.9876044392585754, + "logits/rejected": -0.8423392176628113, + "logps/chosen": -0.2360585480928421, + "logps/rejected": -1.1289315223693848, + "loss": 0.3904, + "odds_ratio_loss": 0.1510620415210724, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01180292759090662, + "rewards/margins": 0.04464365169405937, + "rewards/rejected": -0.056446582078933716, + "sft_loss": 0.2360585480928421, + "step": 2895 + }, + { + "epoch": 2.32, + "grad_norm": 3.870106030569181, + "learning_rate": 7.425473564358457e-07, + "logits/chosen": -0.6248672604560852, + "logits/rejected": -1.0867903232574463, + "logps/chosen": -0.4465237557888031, + "logps/rejected": -1.3280917406082153, + "loss": 0.4823, + "odds_ratio_loss": 0.23933851718902588, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.022326190024614334, + "rewards/margins": 0.04407840222120285, + "rewards/rejected": -0.06640458852052689, + "sft_loss": 0.4465237557888031, + "step": 2900 + }, + { + "epoch": 2.324, + "grad_norm": 6.574323626048454, + "learning_rate": 7.342911334687619e-07, + "logits/chosen": -0.8632572293281555, + "logits/rejected": -1.020462989807129, + "logps/chosen": -0.429226815700531, + "logps/rejected": -1.3181707859039307, + "loss": 0.488, + "odds_ratio_loss": 0.21809737384319305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02146134153008461, + "rewards/margins": 0.044447191059589386, + "rewards/rejected": -0.0659085363149643, + "sft_loss": 0.429226815700531, + "step": 2905 + }, + { + "epoch": 2.328, + "grad_norm": 6.6512572207835525, + "learning_rate": 7.260731586586983e-07, + "logits/chosen": -0.9791855812072754, + "logits/rejected": -0.8407464027404785, + "logps/chosen": -0.16527701914310455, + "logps/rejected": -1.412431001663208, + "loss": 0.4797, + "odds_ratio_loss": 0.13517996668815613, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.008263850584626198, + "rewards/margins": 0.06235770136117935, + "rewards/rejected": -0.070621557533741, + "sft_loss": 0.16527701914310455, + "step": 2910 + }, + { + "epoch": 2.332, + "grad_norm": 8.3662208877826, + "learning_rate": 7.178936100204994e-07, + "logits/chosen": -0.7169278860092163, + "logits/rejected": -1.1434614658355713, + "logps/chosen": -0.619706392288208, + "logps/rejected": -1.5916235446929932, + "loss": 0.5525, + "odds_ratio_loss": 0.2509163022041321, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03098532184958458, + "rewards/margins": 0.0485958568751812, + "rewards/rejected": -0.07958117872476578, + "sft_loss": 0.619706392288208, + "step": 2915 + }, + { + "epoch": 2.336, + "grad_norm": 21.850965502390842, + "learning_rate": 7.097526647366379e-07, + "logits/chosen": -0.9213175773620605, + "logits/rejected": -0.9148879051208496, + "logps/chosen": -0.4506239891052246, + "logps/rejected": -1.790827751159668, + "loss": 0.5104, + "odds_ratio_loss": 0.1461685597896576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02253119833767414, + "rewards/margins": 0.06701019406318665, + "rewards/rejected": -0.08954139798879623, + "sft_loss": 0.4506239891052246, + "step": 2920 + }, + { + "epoch": 2.34, + "grad_norm": 4.021479030561525, + "learning_rate": 7.016504991533727e-07, + "logits/chosen": -0.9241275787353516, + "logits/rejected": -1.168180227279663, + "logps/chosen": -0.34740298986434937, + "logps/rejected": -1.5019714832305908, + "loss": 0.3721, + "odds_ratio_loss": 0.15549840033054352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017370149493217468, + "rewards/margins": 0.057728420943021774, + "rewards/rejected": -0.07509858161211014, + "sft_loss": 0.34740298986434937, + "step": 2925 + }, + { + "epoch": 2.344, + "grad_norm": 5.884610068850595, + "learning_rate": 6.935872887769299e-07, + "logits/chosen": -0.7997936010360718, + "logits/rejected": -0.9736310839653015, + "logps/chosen": -0.5550605058670044, + "logps/rejected": -1.6123679876327515, + "loss": 0.4905, + "odds_ratio_loss": 0.2296057492494583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027753029018640518, + "rewards/margins": 0.05286537855863571, + "rewards/rejected": -0.08061840385198593, + "sft_loss": 0.5550605058670044, + "step": 2930 + }, + { + "epoch": 2.348, + "grad_norm": 4.469717951127329, + "learning_rate": 6.855632082697045e-07, + "logits/chosen": -0.8227846026420593, + "logits/rejected": -1.05718195438385, + "logps/chosen": -0.5576878786087036, + "logps/rejected": -1.503408670425415, + "loss": 0.5521, + "odds_ratio_loss": 0.23267917335033417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02788439765572548, + "rewards/margins": 0.04728604108095169, + "rewards/rejected": -0.07517042756080627, + "sft_loss": 0.5576878786087036, + "step": 2935 + }, + { + "epoch": 2.352, + "grad_norm": 4.357069152595027, + "learning_rate": 6.775784314464717e-07, + "logits/chosen": -0.7742995023727417, + "logits/rejected": -0.7836570143699646, + "logps/chosen": -0.6989033222198486, + "logps/rejected": -1.754209280014038, + "loss": 0.4823, + "odds_ratio_loss": 0.251584529876709, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03494516760110855, + "rewards/margins": 0.05276529863476753, + "rewards/rejected": -0.08771046996116638, + "sft_loss": 0.6989033222198486, + "step": 2940 + }, + { + "epoch": 2.356, + "grad_norm": 5.097226495202278, + "learning_rate": 6.696331312706245e-07, + "logits/chosen": -0.8427531123161316, + "logits/rejected": -0.6410783529281616, + "logps/chosen": -0.4761292338371277, + "logps/rejected": -1.3721054792404175, + "loss": 0.4982, + "odds_ratio_loss": 0.25603288412094116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023806463927030563, + "rewards/margins": 0.04479881748557091, + "rewards/rejected": -0.06860526651144028, + "sft_loss": 0.4761292338371277, + "step": 2945 + }, + { + "epoch": 2.36, + "grad_norm": 9.533319054768027, + "learning_rate": 6.617274798504286e-07, + "logits/chosen": -1.0047314167022705, + "logits/rejected": -1.1931557655334473, + "logps/chosen": -0.30853766202926636, + "logps/rejected": -1.573937177658081, + "loss": 0.4144, + "odds_ratio_loss": 0.1045946478843689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015426883473992348, + "rewards/margins": 0.0632699728012085, + "rewards/rejected": -0.07869686186313629, + "sft_loss": 0.30853766202926636, + "step": 2950 + }, + { + "epoch": 2.364, + "grad_norm": 5.794627406886997, + "learning_rate": 6.538616484352902e-07, + "logits/chosen": -0.6078363656997681, + "logits/rejected": -0.9947171211242676, + "logps/chosen": -0.5081497430801392, + "logps/rejected": -1.6594918966293335, + "loss": 0.4509, + "odds_ratio_loss": 0.18001045286655426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025407487526535988, + "rewards/margins": 0.057567108422517776, + "rewards/rejected": -0.08297459036111832, + "sft_loss": 0.5081497430801392, + "step": 2955 + }, + { + "epoch": 2.368, + "grad_norm": 8.384862451811653, + "learning_rate": 6.460358074120518e-07, + "logits/chosen": -1.0173108577728271, + "logits/rejected": -0.7680644392967224, + "logps/chosen": -0.2868219017982483, + "logps/rejected": -1.2532503604888916, + "loss": 0.4436, + "odds_ratio_loss": 0.1625426560640335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014341096393764019, + "rewards/margins": 0.04832141846418381, + "rewards/rejected": -0.0626625195145607, + "sft_loss": 0.2868219017982483, + "step": 2960 + }, + { + "epoch": 2.372, + "grad_norm": 5.987701383797447, + "learning_rate": 6.382501263012936e-07, + "logits/chosen": -0.39140084385871887, + "logits/rejected": -1.0467764139175415, + "logps/chosen": -0.48364967107772827, + "logps/rejected": -1.6967146396636963, + "loss": 0.4764, + "odds_ratio_loss": 0.14926201105117798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024182487279176712, + "rewards/margins": 0.06065324693918228, + "rewards/rejected": -0.0848357304930687, + "sft_loss": 0.48364967107772827, + "step": 2965 + }, + { + "epoch": 2.376, + "grad_norm": 5.365533722525087, + "learning_rate": 6.305047737536707e-07, + "logits/chosen": -1.085931658744812, + "logits/rejected": -0.8201769590377808, + "logps/chosen": -0.31420257687568665, + "logps/rejected": -1.8399406671524048, + "loss": 0.4935, + "odds_ratio_loss": 0.16777533292770386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015710126608610153, + "rewards/margins": 0.07628689706325531, + "rewards/rejected": -0.09199702739715576, + "sft_loss": 0.31420257687568665, + "step": 2970 + }, + { + "epoch": 2.38, + "grad_norm": 4.187858613296553, + "learning_rate": 6.227999175462521e-07, + "logits/chosen": -1.0166877508163452, + "logits/rejected": -1.3475017547607422, + "logps/chosen": -0.45567312836647034, + "logps/rejected": -1.3064647912979126, + "loss": 0.4065, + "odds_ratio_loss": 0.20544198155403137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022783655673265457, + "rewards/margins": 0.0425395742058754, + "rewards/rejected": -0.06532323360443115, + "sft_loss": 0.45567312836647034, + "step": 2975 + }, + { + "epoch": 2.384, + "grad_norm": 5.441360920553305, + "learning_rate": 6.151357245788917e-07, + "logits/chosen": -0.4649627208709717, + "logits/rejected": -0.7259455323219299, + "logps/chosen": -0.32298606634140015, + "logps/rejected": -1.7776187658309937, + "loss": 0.421, + "odds_ratio_loss": 0.07744672149419785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016149302944540977, + "rewards/margins": 0.0727316364645958, + "rewards/rejected": -0.08888094127178192, + "sft_loss": 0.32298606634140015, + "step": 2980 + }, + { + "epoch": 2.388, + "grad_norm": 6.473770551803507, + "learning_rate": 6.075123608706093e-07, + "logits/chosen": -0.9025441408157349, + "logits/rejected": -0.8097003698348999, + "logps/chosen": -0.30892783403396606, + "logps/rejected": -1.4229497909545898, + "loss": 0.4896, + "odds_ratio_loss": 0.18760545551776886, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.015446392819285393, + "rewards/margins": 0.05570109561085701, + "rewards/rejected": -0.07114748656749725, + "sft_loss": 0.30892783403396606, + "step": 2985 + }, + { + "epoch": 2.392, + "grad_norm": 7.045700421865737, + "learning_rate": 5.999299915559956e-07, + "logits/chosen": -0.8020213842391968, + "logits/rejected": -0.9755544662475586, + "logps/chosen": -0.3655335307121277, + "logps/rejected": -1.7111154794692993, + "loss": 0.4826, + "odds_ratio_loss": 0.1401543915271759, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018276674672961235, + "rewards/margins": 0.06727910041809082, + "rewards/rejected": -0.08555576950311661, + "sft_loss": 0.3655335307121277, + "step": 2990 + }, + { + "epoch": 2.396, + "grad_norm": 5.1559133299959345, + "learning_rate": 5.923887808816373e-07, + "logits/chosen": -0.7541839480400085, + "logits/rejected": -0.8241696357727051, + "logps/chosen": -0.32986879348754883, + "logps/rejected": -1.7209409475326538, + "loss": 0.4559, + "odds_ratio_loss": 0.11603020131587982, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01649343967437744, + "rewards/margins": 0.06955362856388092, + "rewards/rejected": -0.08604706078767776, + "sft_loss": 0.32986879348754883, + "step": 2995 + }, + { + "epoch": 2.4, + "grad_norm": 5.5494435865172695, + "learning_rate": 5.848888922025553e-07, + "logits/chosen": -0.604158341884613, + "logits/rejected": -1.010777473449707, + "logps/chosen": -0.18639525771141052, + "logps/rejected": -1.7561891078948975, + "loss": 0.4748, + "odds_ratio_loss": 0.08248431980609894, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009319763630628586, + "rewards/margins": 0.07848969846963882, + "rewards/rejected": -0.08780945837497711, + "sft_loss": 0.18639525771141052, + "step": 3000 + }, + { + "epoch": 2.404, + "grad_norm": 7.404224976766671, + "learning_rate": 5.774304879786688e-07, + "logits/chosen": -0.8347930908203125, + "logits/rejected": -1.0326844453811646, + "logps/chosen": -0.4041759967803955, + "logps/rejected": -1.3478691577911377, + "loss": 0.4676, + "odds_ratio_loss": 0.1940927505493164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020208800211548805, + "rewards/margins": 0.04718465730547905, + "rewards/rejected": -0.067393459379673, + "sft_loss": 0.4041759967803955, + "step": 3005 + }, + { + "epoch": 2.408, + "grad_norm": 6.3948612577090005, + "learning_rate": 5.700137297712749e-07, + "logits/chosen": -1.0781855583190918, + "logits/rejected": -0.7031400799751282, + "logps/chosen": -0.3962685465812683, + "logps/rejected": -1.2765676975250244, + "loss": 0.5032, + "odds_ratio_loss": 0.2406892478466034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019813427701592445, + "rewards/margins": 0.04401496425271034, + "rewards/rejected": -0.06382839381694794, + "sft_loss": 0.3962685465812683, + "step": 3010 + }, + { + "epoch": 2.412, + "grad_norm": 5.487144565387309, + "learning_rate": 5.626387782395512e-07, + "logits/chosen": -0.8737370371818542, + "logits/rejected": -1.1139321327209473, + "logps/chosen": -0.6051273345947266, + "logps/rejected": -1.6024173498153687, + "loss": 0.4723, + "odds_ratio_loss": 0.23752835392951965, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0302563663572073, + "rewards/margins": 0.0498645082116127, + "rewards/rejected": -0.08012087643146515, + "sft_loss": 0.6051273345947266, + "step": 3015 + }, + { + "epoch": 2.416, + "grad_norm": 6.210678255713737, + "learning_rate": 5.553057931370729e-07, + "logits/chosen": -0.9348379373550415, + "logits/rejected": -0.7690949440002441, + "logps/chosen": -0.6480275392532349, + "logps/rejected": -1.3571795225143433, + "loss": 0.5108, + "odds_ratio_loss": 0.2786480188369751, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.032401375472545624, + "rewards/margins": 0.03545759990811348, + "rewards/rejected": -0.0678589791059494, + "sft_loss": 0.6480275392532349, + "step": 3020 + }, + { + "epoch": 2.42, + "grad_norm": 4.9274946658912615, + "learning_rate": 5.48014933308352e-07, + "logits/chosen": -0.7441031336784363, + "logits/rejected": -1.0764060020446777, + "logps/chosen": -0.38686519861221313, + "logps/rejected": -1.4046424627304077, + "loss": 0.5403, + "odds_ratio_loss": 0.14413750171661377, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019343260675668716, + "rewards/margins": 0.05088886618614197, + "rewards/rejected": -0.07023213058710098, + "sft_loss": 0.38686519861221313, + "step": 3025 + }, + { + "epoch": 2.424, + "grad_norm": 4.697225271080028, + "learning_rate": 5.407663566854008e-07, + "logits/chosen": -0.9233636856079102, + "logits/rejected": -0.7391080260276794, + "logps/chosen": -0.3096584677696228, + "logps/rejected": -1.552175760269165, + "loss": 0.5123, + "odds_ratio_loss": 0.2237163782119751, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.015482926741242409, + "rewards/margins": 0.062125854194164276, + "rewards/rejected": -0.07760877907276154, + "sft_loss": 0.3096584677696228, + "step": 3030 + }, + { + "epoch": 2.428, + "grad_norm": 5.079507350370886, + "learning_rate": 5.335602202843054e-07, + "logits/chosen": -0.7503083944320679, + "logits/rejected": -0.8697785139083862, + "logps/chosen": -0.5030153393745422, + "logps/rejected": -1.383043885231018, + "loss": 0.4996, + "odds_ratio_loss": 0.20985543727874756, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025150766596198082, + "rewards/margins": 0.04400142282247543, + "rewards/rejected": -0.06915219128131866, + "sft_loss": 0.5030153393745422, + "step": 3035 + }, + { + "epoch": 2.432, + "grad_norm": 5.622928495063832, + "learning_rate": 5.263966802018275e-07, + "logits/chosen": -0.8630277514457703, + "logits/rejected": -0.9782209396362305, + "logps/chosen": -0.364883691072464, + "logps/rejected": -3.0828795433044434, + "loss": 0.3849, + "odds_ratio_loss": 0.11438252776861191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01824418641626835, + "rewards/margins": 0.13589979708194733, + "rewards/rejected": -0.15414398908615112, + "sft_loss": 0.364883691072464, + "step": 3040 + }, + { + "epoch": 2.436, + "grad_norm": 6.518609406323272, + "learning_rate": 5.192758916120236e-07, + "logits/chosen": -0.720903754234314, + "logits/rejected": -0.9428263902664185, + "logps/chosen": -0.44252529740333557, + "logps/rejected": -1.551532506942749, + "loss": 0.5285, + "odds_ratio_loss": 0.19013534486293793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022126266732811928, + "rewards/margins": 0.05545036867260933, + "rewards/rejected": -0.0775766372680664, + "sft_loss": 0.44252529740333557, + "step": 3045 + }, + { + "epoch": 2.44, + "grad_norm": 6.898029710003963, + "learning_rate": 5.121980087628802e-07, + "logits/chosen": -1.102452039718628, + "logits/rejected": -1.1401493549346924, + "logps/chosen": -0.6420519351959229, + "logps/rejected": -1.6801929473876953, + "loss": 0.491, + "odds_ratio_loss": 0.22621390223503113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03210259601473808, + "rewards/margins": 0.05190705135464668, + "rewards/rejected": -0.08400964736938477, + "sft_loss": 0.6420519351959229, + "step": 3050 + }, + { + "epoch": 2.444, + "grad_norm": 7.428665380081318, + "learning_rate": 5.051631849729785e-07, + "logits/chosen": -0.7611395120620728, + "logits/rejected": -0.9640461802482605, + "logps/chosen": -0.7123726606369019, + "logps/rejected": -1.2486783266067505, + "loss": 0.4901, + "odds_ratio_loss": 0.40984076261520386, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03561863675713539, + "rewards/margins": 0.026815274730324745, + "rewards/rejected": -0.06243390962481499, + "sft_loss": 0.7123726606369019, + "step": 3055 + }, + { + "epoch": 2.448, + "grad_norm": 4.15877124152584, + "learning_rate": 4.981715726281666e-07, + "logits/chosen": -0.8188239336013794, + "logits/rejected": -1.3134756088256836, + "logps/chosen": -0.42243289947509766, + "logps/rejected": -1.7504783868789673, + "loss": 0.4158, + "odds_ratio_loss": 0.14152325689792633, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.021121647208929062, + "rewards/margins": 0.06640227138996124, + "rewards/rejected": -0.08752389997243881, + "sft_loss": 0.42243289947509766, + "step": 3060 + }, + { + "epoch": 2.452, + "grad_norm": 4.5394562483362435, + "learning_rate": 4.912233231782623e-07, + "logits/chosen": -0.844011127948761, + "logits/rejected": -0.77781081199646, + "logps/chosen": -0.5193901658058167, + "logps/rejected": -1.5341647863388062, + "loss": 0.5074, + "odds_ratio_loss": 0.34647947549819946, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.02596951089799404, + "rewards/margins": 0.050738729536533356, + "rewards/rejected": -0.07670824229717255, + "sft_loss": 0.5193901658058167, + "step": 3065 + }, + { + "epoch": 2.456, + "grad_norm": 5.9484966146690885, + "learning_rate": 4.843185871337722e-07, + "logits/chosen": -1.1236478090286255, + "logits/rejected": -1.187819242477417, + "logps/chosen": -0.3840157687664032, + "logps/rejected": -1.7428499460220337, + "loss": 0.5018, + "odds_ratio_loss": 0.13463526964187622, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01920078694820404, + "rewards/margins": 0.06794170290231705, + "rewards/rejected": -0.08714248239994049, + "sft_loss": 0.3840157687664032, + "step": 3070 + }, + { + "epoch": 2.46, + "grad_norm": 5.253460866623653, + "learning_rate": 4.774575140626317e-07, + "logits/chosen": -0.7584226131439209, + "logits/rejected": -0.8733755350112915, + "logps/chosen": -0.6464089155197144, + "logps/rejected": -1.525433897972107, + "loss": 0.4441, + "odds_ratio_loss": 0.25305792689323425, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03232044726610184, + "rewards/margins": 0.04395125061273575, + "rewards/rejected": -0.07627169787883759, + "sft_loss": 0.6464089155197144, + "step": 3075 + }, + { + "epoch": 2.464, + "grad_norm": 12.350993791450163, + "learning_rate": 4.706402525869633e-07, + "logits/chosen": -0.7759664058685303, + "logits/rejected": -0.9143487811088562, + "logps/chosen": -0.3908182978630066, + "logps/rejected": -1.8124420642852783, + "loss": 0.4794, + "odds_ratio_loss": 0.12326017767190933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01954091526567936, + "rewards/margins": 0.07108117640018463, + "rewards/rejected": -0.09062208980321884, + "sft_loss": 0.3908182978630066, + "step": 3080 + }, + { + "epoch": 2.468, + "grad_norm": 14.144313900187102, + "learning_rate": 4.638669503798579e-07, + "logits/chosen": -0.8764970898628235, + "logits/rejected": -0.9682470560073853, + "logps/chosen": -0.3992885947227478, + "logps/rejected": -1.7859230041503906, + "loss": 0.5238, + "odds_ratio_loss": 0.14603550732135773, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0199644286185503, + "rewards/margins": 0.06933172792196274, + "rewards/rejected": -0.08929616957902908, + "sft_loss": 0.3992885947227478, + "step": 3085 + }, + { + "epoch": 2.472, + "grad_norm": 4.717707133780473, + "learning_rate": 4.5713775416217884e-07, + "logits/chosen": -0.7459074854850769, + "logits/rejected": -0.9472630620002747, + "logps/chosen": -0.3936541676521301, + "logps/rejected": -1.3656264543533325, + "loss": 0.4566, + "odds_ratio_loss": 0.1686370074748993, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019682709127664566, + "rewards/margins": 0.04859861359000206, + "rewards/rejected": -0.06828131526708603, + "sft_loss": 0.3936541676521301, + "step": 3090 + }, + { + "epoch": 2.476, + "grad_norm": 6.069217496310175, + "learning_rate": 4.5045280969937847e-07, + "logits/chosen": -0.7254356145858765, + "logits/rejected": -1.232452154159546, + "logps/chosen": -0.5967345833778381, + "logps/rejected": -1.7667179107666016, + "loss": 0.4779, + "odds_ratio_loss": 0.31498411297798157, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.029836729168891907, + "rewards/margins": 0.05849916860461235, + "rewards/rejected": -0.08833589404821396, + "sft_loss": 0.5967345833778381, + "step": 3095 + }, + { + "epoch": 2.48, + "grad_norm": 6.104980238131941, + "learning_rate": 4.438122617983442e-07, + "logits/chosen": -0.8972131609916687, + "logits/rejected": -0.9765474200248718, + "logps/chosen": -0.42376136779785156, + "logps/rejected": -1.4339518547058105, + "loss": 0.4595, + "odds_ratio_loss": 0.17065298557281494, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02118806727230549, + "rewards/margins": 0.05050952360033989, + "rewards/rejected": -0.07169759273529053, + "sft_loss": 0.42376136779785156, + "step": 3100 + }, + { + "epoch": 2.484, + "grad_norm": 6.781146830921427, + "learning_rate": 4.372162543042624e-07, + "logits/chosen": -0.8027432560920715, + "logits/rejected": -0.9535185694694519, + "logps/chosen": -0.5751426219940186, + "logps/rejected": -1.7298011779785156, + "loss": 0.443, + "odds_ratio_loss": 0.19305340945720673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02875712886452675, + "rewards/margins": 0.05773293972015381, + "rewards/rejected": -0.08649007230997086, + "sft_loss": 0.5751426219940186, + "step": 3105 + }, + { + "epoch": 2.488, + "grad_norm": 5.981163589398859, + "learning_rate": 4.3066493009749853e-07, + "logits/chosen": -0.6171215772628784, + "logits/rejected": -0.874431312084198, + "logps/chosen": -0.6073184609413147, + "logps/rejected": -1.8347069025039673, + "loss": 0.3958, + "odds_ratio_loss": 0.24824929237365723, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.030365925282239914, + "rewards/margins": 0.061369407922029495, + "rewards/rejected": -0.09173533320426941, + "sft_loss": 0.6073184609413147, + "step": 3110 + }, + { + "epoch": 2.492, + "grad_norm": 5.642504715902896, + "learning_rate": 4.2415843109050667e-07, + "logits/chosen": -0.9500174522399902, + "logits/rejected": -1.0051153898239136, + "logps/chosen": -0.3405509889125824, + "logps/rejected": -1.6603267192840576, + "loss": 0.4716, + "odds_ratio_loss": 0.1337331086397171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01702754944562912, + "rewards/margins": 0.06598879396915436, + "rewards/rejected": -0.08301634341478348, + "sft_loss": 0.3405509889125824, + "step": 3115 + }, + { + "epoch": 2.496, + "grad_norm": 6.4696426005968775, + "learning_rate": 4.1769689822475147e-07, + "logits/chosen": -0.57563316822052, + "logits/rejected": -0.9418606758117676, + "logps/chosen": -0.5054479837417603, + "logps/rejected": -1.5817244052886963, + "loss": 0.4687, + "odds_ratio_loss": 0.21483473479747772, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.025272399187088013, + "rewards/margins": 0.05381382629275322, + "rewards/rejected": -0.07908622920513153, + "sft_loss": 0.5054479837417603, + "step": 3120 + }, + { + "epoch": 2.5, + "grad_norm": 4.172914702601585, + "learning_rate": 4.1128047146765936e-07, + "logits/chosen": -0.7602171897888184, + "logits/rejected": -1.0409475564956665, + "logps/chosen": -0.40496939420700073, + "logps/rejected": -1.4085716009140015, + "loss": 0.4094, + "odds_ratio_loss": 0.21070821583271027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020248468965291977, + "rewards/margins": 0.050180114805698395, + "rewards/rejected": -0.07042858749628067, + "sft_loss": 0.40496939420700073, + "step": 3125 + }, + { + "epoch": 2.504, + "grad_norm": 5.627553812250054, + "learning_rate": 4.049092898095816e-07, + "logits/chosen": -0.8009653091430664, + "logits/rejected": -1.1446640491485596, + "logps/chosen": -0.302462637424469, + "logps/rejected": -1.758111596107483, + "loss": 0.5068, + "odds_ratio_loss": 0.08966059982776642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015123131684958935, + "rewards/margins": 0.07278244942426682, + "rewards/rejected": -0.08790557831525803, + "sft_loss": 0.302462637424469, + "step": 3130 + }, + { + "epoch": 2.508, + "grad_norm": 5.282423292390493, + "learning_rate": 3.9858349126078945e-07, + "logits/chosen": -1.0294119119644165, + "logits/rejected": -0.672359049320221, + "logps/chosen": -0.2604931592941284, + "logps/rejected": -1.511343002319336, + "loss": 0.4927, + "odds_ratio_loss": 0.10487516969442368, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013024657964706421, + "rewards/margins": 0.06254249066114426, + "rewards/rejected": -0.07556714862585068, + "sft_loss": 0.2604931592941284, + "step": 3135 + }, + { + "epoch": 2.512, + "grad_norm": 5.2623944719501585, + "learning_rate": 3.9230321284847856e-07, + "logits/chosen": -0.9906686544418335, + "logits/rejected": -0.7455857992172241, + "logps/chosen": -0.599448561668396, + "logps/rejected": -1.501896619796753, + "loss": 0.4545, + "odds_ratio_loss": 0.2790692448616028, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02997243031859398, + "rewards/margins": 0.045122403651475906, + "rewards/rejected": -0.07509483397006989, + "sft_loss": 0.599448561668396, + "step": 3140 + }, + { + "epoch": 2.516, + "grad_norm": 5.779281261694176, + "learning_rate": 3.86068590613804e-07, + "logits/chosen": -0.8702551126480103, + "logits/rejected": -1.0743459463119507, + "logps/chosen": -0.2949695885181427, + "logps/rejected": -1.3731797933578491, + "loss": 0.4727, + "odds_ratio_loss": 0.13363251090049744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01474847923964262, + "rewards/margins": 0.0539105124771595, + "rewards/rejected": -0.0686589926481247, + "sft_loss": 0.2949695885181427, + "step": 3145 + }, + { + "epoch": 2.52, + "grad_norm": 6.263065115062914, + "learning_rate": 3.798797596089351e-07, + "logits/chosen": -1.070436716079712, + "logits/rejected": -1.0166070461273193, + "logps/chosen": -0.2887745499610901, + "logps/rejected": -1.5660569667816162, + "loss": 0.4361, + "odds_ratio_loss": 0.11832698434591293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014438727870583534, + "rewards/margins": 0.06386412680149078, + "rewards/rejected": -0.07830285280942917, + "sft_loss": 0.2887745499610901, + "step": 3150 + }, + { + "epoch": 2.524, + "grad_norm": 6.821197735395411, + "learning_rate": 3.737368538941255e-07, + "logits/chosen": -0.9708150029182434, + "logits/rejected": -0.8317630887031555, + "logps/chosen": -0.41128820180892944, + "logps/rejected": -1.2074782848358154, + "loss": 0.4127, + "odds_ratio_loss": 0.2423263043165207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020564410835504532, + "rewards/margins": 0.03980950638651848, + "rewards/rejected": -0.06037392094731331, + "sft_loss": 0.41128820180892944, + "step": 3155 + }, + { + "epoch": 2.528, + "grad_norm": 5.542329936785543, + "learning_rate": 3.6764000653481263e-07, + "logits/chosen": -1.2484712600708008, + "logits/rejected": -0.9088461995124817, + "logps/chosen": -0.3338097929954529, + "logps/rejected": -1.6267837285995483, + "loss": 0.4552, + "odds_ratio_loss": 0.12026141583919525, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.016690488904714584, + "rewards/margins": 0.06464870274066925, + "rewards/rejected": -0.08133919537067413, + "sft_loss": 0.3338097929954529, + "step": 3160 + }, + { + "epoch": 2.532, + "grad_norm": 8.97897074618829, + "learning_rate": 3.615893495987335e-07, + "logits/chosen": -0.5620380640029907, + "logits/rejected": -0.8675304651260376, + "logps/chosen": -0.5898982882499695, + "logps/rejected": -1.3201124668121338, + "loss": 0.4837, + "odds_ratio_loss": 0.3108993470668793, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.029494917020201683, + "rewards/margins": 0.036510709673166275, + "rewards/rejected": -0.0660056322813034, + "sft_loss": 0.5898982882499695, + "step": 3165 + }, + { + "epoch": 2.536, + "grad_norm": 4.681514042861988, + "learning_rate": 3.555850141530659e-07, + "logits/chosen": -1.0148943662643433, + "logits/rejected": -0.982884407043457, + "logps/chosen": -0.22993119060993195, + "logps/rejected": -1.4043452739715576, + "loss": 0.5221, + "odds_ratio_loss": 0.08220822364091873, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011496557854115963, + "rewards/margins": 0.058720700442790985, + "rewards/rejected": -0.07021726667881012, + "sft_loss": 0.22993119060993195, + "step": 3170 + }, + { + "epoch": 2.54, + "grad_norm": 7.722013429404472, + "learning_rate": 3.4962713026158697e-07, + "logits/chosen": -0.5106195211410522, + "logits/rejected": -1.0413014888763428, + "logps/chosen": -0.6414362788200378, + "logps/rejected": -1.3033192157745361, + "loss": 0.4655, + "odds_ratio_loss": 0.3088650107383728, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03207181394100189, + "rewards/margins": 0.0330941416323185, + "rewards/rejected": -0.06516595929861069, + "sft_loss": 0.6414362788200378, + "step": 3175 + }, + { + "epoch": 2.544, + "grad_norm": 4.914705494262134, + "learning_rate": 3.4371582698185636e-07, + "logits/chosen": -1.18616783618927, + "logits/rejected": -0.963121771812439, + "logps/chosen": -0.31630367040634155, + "logps/rejected": -1.29836106300354, + "loss": 0.5218, + "odds_ratio_loss": 0.20544719696044922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015815183520317078, + "rewards/margins": 0.049102868884801865, + "rewards/rejected": -0.06491805613040924, + "sft_loss": 0.31630367040634155, + "step": 3180 + }, + { + "epoch": 2.548, + "grad_norm": 10.854835141924323, + "learning_rate": 3.378512323624228e-07, + "logits/chosen": -0.5074478387832642, + "logits/rejected": -1.2903783321380615, + "logps/chosen": -0.7434813976287842, + "logps/rejected": -1.6036231517791748, + "loss": 0.5155, + "odds_ratio_loss": 0.265799343585968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03717407211661339, + "rewards/margins": 0.04300709441304207, + "rewards/rejected": -0.08018116652965546, + "sft_loss": 0.7434813976287842, + "step": 3185 + }, + { + "epoch": 2.552, + "grad_norm": 5.764157322717205, + "learning_rate": 3.3203347344004737e-07, + "logits/chosen": -0.7570070028305054, + "logits/rejected": -1.1250250339508057, + "logps/chosen": -0.6759325265884399, + "logps/rejected": -1.439347267150879, + "loss": 0.5504, + "odds_ratio_loss": 0.3499813973903656, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.033796630799770355, + "rewards/margins": 0.03817072883248329, + "rewards/rejected": -0.07196736335754395, + "sft_loss": 0.6759325265884399, + "step": 3190 + }, + { + "epoch": 2.556, + "grad_norm": 4.888945884008844, + "learning_rate": 3.262626762369525e-07, + "logits/chosen": -1.1923449039459229, + "logits/rejected": -0.8670722842216492, + "logps/chosen": -0.31347471475601196, + "logps/rejected": -1.4067668914794922, + "loss": 0.4052, + "odds_ratio_loss": 0.12347825616598129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01567373424768448, + "rewards/margins": 0.05466460436582565, + "rewards/rejected": -0.07033834606409073, + "sft_loss": 0.31347471475601196, + "step": 3195 + }, + { + "epoch": 2.56, + "grad_norm": 4.380022841165363, + "learning_rate": 3.2053896575809426e-07, + "logits/chosen": -0.5602080821990967, + "logits/rejected": -1.0972874164581299, + "logps/chosen": -0.7473759651184082, + "logps/rejected": -1.6633918285369873, + "loss": 0.4408, + "odds_ratio_loss": 0.2667500674724579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03736879676580429, + "rewards/margins": 0.04580079764127731, + "rewards/rejected": -0.0831695944070816, + "sft_loss": 0.7473759651184082, + "step": 3200 + }, + { + "epoch": 2.564, + "grad_norm": 5.674973517980157, + "learning_rate": 3.148624659884508e-07, + "logits/chosen": -0.7617183327674866, + "logits/rejected": -0.972303569316864, + "logps/chosen": -0.4322594702243805, + "logps/rejected": -1.4709136486053467, + "loss": 0.4488, + "odds_ratio_loss": 0.186048686504364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021612973883748055, + "rewards/margins": 0.05193271115422249, + "rewards/rejected": -0.0735456794500351, + "sft_loss": 0.4322594702243805, + "step": 3205 + }, + { + "epoch": 2.568, + "grad_norm": 4.977829670896469, + "learning_rate": 3.092332998903416e-07, + "logits/chosen": -0.8257712125778198, + "logits/rejected": -0.8016840219497681, + "logps/chosen": -0.4363314211368561, + "logps/rejected": -1.1737916469573975, + "loss": 0.4606, + "odds_ratio_loss": 0.22321203351020813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021816570311784744, + "rewards/margins": 0.03687301650643349, + "rewards/rejected": -0.05868958681821823, + "sft_loss": 0.4363314211368561, + "step": 3210 + }, + { + "epoch": 2.572, + "grad_norm": 5.0240652358480755, + "learning_rate": 3.0365158940075664e-07, + "logits/chosen": -1.0381290912628174, + "logits/rejected": -1.1659702062606812, + "logps/chosen": -0.5309845209121704, + "logps/rejected": -1.4535753726959229, + "loss": 0.6035, + "odds_ratio_loss": 0.28817451000213623, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.02654922381043434, + "rewards/margins": 0.04612954333424568, + "rewards/rejected": -0.07267877459526062, + "sft_loss": 0.5309845209121704, + "step": 3215 + }, + { + "epoch": 2.576, + "grad_norm": 5.408869231167505, + "learning_rate": 2.981174554287239e-07, + "logits/chosen": -1.015199899673462, + "logits/rejected": -1.3747820854187012, + "logps/chosen": -0.4069809317588806, + "logps/rejected": -1.5189340114593506, + "loss": 0.4696, + "odds_ratio_loss": 0.16449251770973206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02034904807806015, + "rewards/margins": 0.05559765547513962, + "rewards/rejected": -0.07594670355319977, + "sft_loss": 0.4069809317588806, + "step": 3220 + }, + { + "epoch": 2.58, + "grad_norm": 7.760925759203226, + "learning_rate": 2.9263101785268253e-07, + "logits/chosen": -0.5573514699935913, + "logits/rejected": -1.252190113067627, + "logps/chosen": -0.41566723585128784, + "logps/rejected": -1.4632532596588135, + "loss": 0.4529, + "odds_ratio_loss": 0.16219457983970642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02078336291015148, + "rewards/margins": 0.0523792989552021, + "rewards/rejected": -0.07316266000270844, + "sft_loss": 0.41566723585128784, + "step": 3225 + }, + { + "epoch": 2.584, + "grad_norm": 6.9168438538457995, + "learning_rate": 2.871923955178918e-07, + "logits/chosen": -0.7818273901939392, + "logits/rejected": -0.9009189605712891, + "logps/chosen": -0.5484607219696045, + "logps/rejected": -1.4074246883392334, + "loss": 0.4963, + "odds_ratio_loss": 0.2880396842956543, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.027423039078712463, + "rewards/margins": 0.042948197573423386, + "rewards/rejected": -0.07037124037742615, + "sft_loss": 0.5484607219696045, + "step": 3230 + }, + { + "epoch": 2.588, + "grad_norm": 5.722693546238487, + "learning_rate": 2.8180170623385213e-07, + "logits/chosen": -0.9606497883796692, + "logits/rejected": -0.9924991726875305, + "logps/chosen": -0.4993307590484619, + "logps/rejected": -2.206209659576416, + "loss": 0.4349, + "odds_ratio_loss": 0.2471529245376587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024966537952423096, + "rewards/margins": 0.08534395694732666, + "rewards/rejected": -0.11031049489974976, + "sft_loss": 0.4993307590484619, + "step": 3235 + }, + { + "epoch": 2.592, + "grad_norm": 5.892839113416792, + "learning_rate": 2.764590667717562e-07, + "logits/chosen": -0.47418349981307983, + "logits/rejected": -1.210965871810913, + "logps/chosen": -0.5353994965553284, + "logps/rejected": -1.8738822937011719, + "loss": 0.4341, + "odds_ratio_loss": 0.2881673276424408, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.02676997520029545, + "rewards/margins": 0.06692413985729218, + "rewards/rejected": -0.09369411319494247, + "sft_loss": 0.5353994965553284, + "step": 3240 + }, + { + "epoch": 2.596, + "grad_norm": 4.975415723996069, + "learning_rate": 2.7116459286195887e-07, + "logits/chosen": -1.0550581216812134, + "logits/rejected": -1.0994212627410889, + "logps/chosen": -0.29386892914772034, + "logps/rejected": -1.3806931972503662, + "loss": 0.4509, + "odds_ratio_loss": 0.1265828013420105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014693446457386017, + "rewards/margins": 0.054341208189725876, + "rewards/rejected": -0.0690346509218216, + "sft_loss": 0.29386892914772034, + "step": 3245 + }, + { + "epoch": 2.6, + "grad_norm": 4.814970729318306, + "learning_rate": 2.6591839919146963e-07, + "logits/chosen": -0.7558291554450989, + "logits/rejected": -0.8831847906112671, + "logps/chosen": -0.4392101764678955, + "logps/rejected": -1.304246187210083, + "loss": 0.494, + "odds_ratio_loss": 0.24885766208171844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021960508078336716, + "rewards/margins": 0.04325180500745773, + "rewards/rejected": -0.06521230936050415, + "sft_loss": 0.4392101764678955, + "step": 3250 + }, + { + "epoch": 2.604, + "grad_norm": 8.307336904912425, + "learning_rate": 2.6072059940146775e-07, + "logits/chosen": -1.1570188999176025, + "logits/rejected": -0.8211970329284668, + "logps/chosen": -0.4389687180519104, + "logps/rejected": -1.5756551027297974, + "loss": 0.4346, + "odds_ratio_loss": 0.1567719727754593, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0219484381377697, + "rewards/margins": 0.05683432146906853, + "rewards/rejected": -0.07878275215625763, + "sft_loss": 0.4389687180519104, + "step": 3255 + }, + { + "epoch": 2.608, + "grad_norm": 5.842083846009801, + "learning_rate": 2.555713060848433e-07, + "logits/chosen": -0.9055282473564148, + "logits/rejected": -1.0206440687179565, + "logps/chosen": -0.6034643650054932, + "logps/rejected": -1.3517249822616577, + "loss": 0.4756, + "odds_ratio_loss": 0.37321245670318604, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.030173221603035927, + "rewards/margins": 0.037413034588098526, + "rewards/rejected": -0.067586250603199, + "sft_loss": 0.6034643650054932, + "step": 3260 + }, + { + "epoch": 2.612, + "grad_norm": 26.57275268189309, + "learning_rate": 2.504706307837551e-07, + "logits/chosen": -0.9575685262680054, + "logits/rejected": -1.1905924081802368, + "logps/chosen": -0.43124690651893616, + "logps/rejected": -1.5143071413040161, + "loss": 0.4799, + "odds_ratio_loss": 0.24718472361564636, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.021562347188591957, + "rewards/margins": 0.054153017699718475, + "rewards/rejected": -0.07571535557508469, + "sft_loss": 0.43124690651893616, + "step": 3265 + }, + { + "epoch": 2.616, + "grad_norm": 12.548046244131415, + "learning_rate": 2.454186839872158e-07, + "logits/chosen": -0.8983935117721558, + "logits/rejected": -0.9400730133056641, + "logps/chosen": -0.5530645251274109, + "logps/rejected": -1.652138113975525, + "loss": 0.4669, + "odds_ratio_loss": 0.21761877834796906, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.027653228491544724, + "rewards/margins": 0.054953683167696, + "rewards/rejected": -0.08260690420866013, + "sft_loss": 0.5530645251274109, + "step": 3270 + }, + { + "epoch": 2.62, + "grad_norm": 5.389650137843442, + "learning_rate": 2.404155751286988e-07, + "logits/chosen": -1.0610014200210571, + "logits/rejected": -1.133551836013794, + "logps/chosen": -0.39243531227111816, + "logps/rejected": -1.79599130153656, + "loss": 0.4603, + "odds_ratio_loss": 0.2275979220867157, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.01962176337838173, + "rewards/margins": 0.07017780840396881, + "rewards/rejected": -0.08979956060647964, + "sft_loss": 0.39243531227111816, + "step": 3275 + }, + { + "epoch": 2.624, + "grad_norm": 5.664088641927079, + "learning_rate": 2.3546141258376786e-07, + "logits/chosen": -0.7254642248153687, + "logits/rejected": -0.6703656911849976, + "logps/chosen": -0.3163744807243347, + "logps/rejected": -1.2279107570648193, + "loss": 0.5009, + "odds_ratio_loss": 0.15358732640743256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015818724408745766, + "rewards/margins": 0.04557682201266289, + "rewards/rejected": -0.0613955482840538, + "sft_loss": 0.3163744807243347, + "step": 3280 + }, + { + "epoch": 2.628, + "grad_norm": 4.6344681245327495, + "learning_rate": 2.3055630366772857e-07, + "logits/chosen": -0.7210429906845093, + "logits/rejected": -0.7038585543632507, + "logps/chosen": -0.45490559935569763, + "logps/rejected": -1.3154146671295166, + "loss": 0.5243, + "odds_ratio_loss": 0.2467753142118454, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.022745277732610703, + "rewards/margins": 0.04302545264363289, + "rewards/rejected": -0.06577073037624359, + "sft_loss": 0.45490559935569763, + "step": 3285 + }, + { + "epoch": 2.632, + "grad_norm": 4.8837015178641705, + "learning_rate": 2.257003546333042e-07, + "logits/chosen": -0.9644176363945007, + "logits/rejected": -1.0574061870574951, + "logps/chosen": -0.4861271381378174, + "logps/rejected": -1.230362892150879, + "loss": 0.506, + "odds_ratio_loss": 0.223617285490036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02430635876953602, + "rewards/margins": 0.03721178323030472, + "rewards/rejected": -0.061518143862485886, + "sft_loss": 0.4861271381378174, + "step": 3290 + }, + { + "epoch": 2.636, + "grad_norm": 4.840338076295951, + "learning_rate": 2.208936706683351e-07, + "logits/chosen": -0.747097909450531, + "logits/rejected": -0.7051985859870911, + "logps/chosen": -0.5597037076950073, + "logps/rejected": -1.4610236883163452, + "loss": 0.4651, + "odds_ratio_loss": 0.3216997981071472, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.027985185384750366, + "rewards/margins": 0.045065999031066895, + "rewards/rejected": -0.07305117696523666, + "sft_loss": 0.5597037076950073, + "step": 3295 + }, + { + "epoch": 2.64, + "grad_norm": 7.554990598127512, + "learning_rate": 2.1613635589349756e-07, + "logits/chosen": -0.8587006330490112, + "logits/rejected": -0.9386578798294067, + "logps/chosen": -0.4001200199127197, + "logps/rejected": -1.5990378856658936, + "loss": 0.4853, + "odds_ratio_loss": 0.2407374083995819, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.020006002858281136, + "rewards/margins": 0.05994589254260063, + "rewards/rejected": -0.07995189726352692, + "sft_loss": 0.4001200199127197, + "step": 3300 + }, + { + "epoch": 2.644, + "grad_norm": 9.49274069461471, + "learning_rate": 2.1142851336005244e-07, + "logits/chosen": -1.1812444925308228, + "logits/rejected": -1.0167138576507568, + "logps/chosen": -0.39482712745666504, + "logps/rejected": -1.4087440967559814, + "loss": 0.4741, + "odds_ratio_loss": 0.14712780714035034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019741356372833252, + "rewards/margins": 0.05069585517048836, + "rewards/rejected": -0.07043720781803131, + "sft_loss": 0.39482712745666504, + "step": 3305 + }, + { + "epoch": 2.648, + "grad_norm": 6.995561448070307, + "learning_rate": 2.0677024504760752e-07, + "logits/chosen": -1.0624709129333496, + "logits/rejected": -1.4392796754837036, + "logps/chosen": -0.5113095641136169, + "logps/rejected": -1.4682908058166504, + "loss": 0.4876, + "odds_ratio_loss": 0.37565088272094727, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.025565480813384056, + "rewards/margins": 0.04784905165433884, + "rewards/rejected": -0.07341454178094864, + "sft_loss": 0.5113095641136169, + "step": 3310 + }, + { + "epoch": 2.652, + "grad_norm": 7.478983570395769, + "learning_rate": 2.0216165186191406e-07, + "logits/chosen": -0.7986949682235718, + "logits/rejected": -1.3443512916564941, + "logps/chosen": -0.6050113439559937, + "logps/rejected": -1.6553246974945068, + "loss": 0.4864, + "odds_ratio_loss": 0.1889444887638092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030250567942857742, + "rewards/margins": 0.052515674382448196, + "rewards/rejected": -0.08276623487472534, + "sft_loss": 0.6050113439559937, + "step": 3315 + }, + { + "epoch": 2.656, + "grad_norm": 5.628793749435426, + "learning_rate": 1.9760283363267684e-07, + "logits/chosen": -0.96771240234375, + "logits/rejected": -0.6631850004196167, + "logps/chosen": -0.5663673877716064, + "logps/rejected": -1.225140929222107, + "loss": 0.5049, + "odds_ratio_loss": 0.3630017340183258, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.028318369761109352, + "rewards/margins": 0.03293868154287338, + "rewards/rejected": -0.06125704571604729, + "sft_loss": 0.5663673877716064, + "step": 3320 + }, + { + "epoch": 2.66, + "grad_norm": 4.202178749975983, + "learning_rate": 1.9309388911139427e-07, + "logits/chosen": -0.8587250709533691, + "logits/rejected": -1.0437920093536377, + "logps/chosen": -0.5410333871841431, + "logps/rejected": -1.3033370971679688, + "loss": 0.4574, + "odds_ratio_loss": 0.25340771675109863, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.027051672339439392, + "rewards/margins": 0.038115184754133224, + "rewards/rejected": -0.06516685336828232, + "sft_loss": 0.5410333871841431, + "step": 3325 + }, + { + "epoch": 2.664, + "grad_norm": 4.996160979584086, + "learning_rate": 1.8863491596921745e-07, + "logits/chosen": -0.9663417935371399, + "logits/rejected": -1.0380603075027466, + "logps/chosen": -0.42533501982688904, + "logps/rejected": -1.5897793769836426, + "loss": 0.4536, + "odds_ratio_loss": 0.1942438781261444, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0212667528539896, + "rewards/margins": 0.058222223073244095, + "rewards/rejected": -0.07948897033929825, + "sft_loss": 0.42533501982688904, + "step": 3330 + }, + { + "epoch": 2.668, + "grad_norm": 5.226973499413544, + "learning_rate": 1.8422601079483516e-07, + "logits/chosen": -0.8443425297737122, + "logits/rejected": -1.0523451566696167, + "logps/chosen": -0.6304419636726379, + "logps/rejected": -1.6065928936004639, + "loss": 0.4965, + "odds_ratio_loss": 0.25557130575180054, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03152209892868996, + "rewards/margins": 0.048807550221681595, + "rewards/rejected": -0.08032964915037155, + "sft_loss": 0.6304419636726379, + "step": 3335 + }, + { + "epoch": 2.672, + "grad_norm": 5.151887544991019, + "learning_rate": 1.798672690923828e-07, + "logits/chosen": -0.4008614122867584, + "logits/rejected": -1.028642177581787, + "logps/chosen": -0.4695549011230469, + "logps/rejected": -1.3987529277801514, + "loss": 0.4852, + "odds_ratio_loss": 0.2155243456363678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023477744311094284, + "rewards/margins": 0.046459902077913284, + "rewards/rejected": -0.06993765383958817, + "sft_loss": 0.4695549011230469, + "step": 3340 + }, + { + "epoch": 2.676, + "grad_norm": 4.960271872554221, + "learning_rate": 1.7555878527937164e-07, + "logits/chosen": -1.132819414138794, + "logits/rejected": -1.2651735544204712, + "logps/chosen": -0.2899089753627777, + "logps/rejected": -1.4967478513717651, + "loss": 0.4768, + "odds_ratio_loss": 0.12462642043828964, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014495449140667915, + "rewards/margins": 0.06034195423126221, + "rewards/rejected": -0.07483740150928497, + "sft_loss": 0.2899089753627777, + "step": 3345 + }, + { + "epoch": 2.68, + "grad_norm": 5.252700869191946, + "learning_rate": 1.713006526846439e-07, + "logits/chosen": -1.0679802894592285, + "logits/rejected": -1.2805407047271729, + "logps/chosen": -0.6222777366638184, + "logps/rejected": -1.4826539754867554, + "loss": 0.4673, + "odds_ratio_loss": 0.23923306167125702, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.031113887205719948, + "rewards/margins": 0.04301881417632103, + "rewards/rejected": -0.07413269579410553, + "sft_loss": 0.6222777366638184, + "step": 3350 + }, + { + "epoch": 2.684, + "grad_norm": 6.088871700564241, + "learning_rate": 1.6709296354635335e-07, + "logits/chosen": -0.7418292760848999, + "logits/rejected": -1.1808269023895264, + "logps/chosen": -0.7619951963424683, + "logps/rejected": -1.662157654762268, + "loss": 0.514, + "odds_ratio_loss": 0.3287936747074127, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.038099758327007294, + "rewards/margins": 0.04500812292098999, + "rewards/rejected": -0.08310787379741669, + "sft_loss": 0.7619951963424683, + "step": 3355 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 4.838274349890116, + "learning_rate": 1.629358090099639e-07, + "logits/chosen": -1.0373485088348389, + "logits/rejected": -1.0439493656158447, + "logps/chosen": -0.42322856187820435, + "logps/rejected": -1.5781844854354858, + "loss": 0.4928, + "odds_ratio_loss": 0.1596982479095459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021161429584026337, + "rewards/margins": 0.05774780362844467, + "rewards/rejected": -0.07890923321247101, + "sft_loss": 0.42322856187820435, + "step": 3360 + }, + { + "epoch": 2.692, + "grad_norm": 4.434071150396496, + "learning_rate": 1.5882927912627772e-07, + "logits/chosen": -0.8480218648910522, + "logits/rejected": -0.9117224812507629, + "logps/chosen": -0.4945312440395355, + "logps/rejected": -1.384734034538269, + "loss": 0.4637, + "odds_ratio_loss": 0.23109391331672668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024726565927267075, + "rewards/margins": 0.0445101372897625, + "rewards/rejected": -0.06923670321702957, + "sft_loss": 0.4945312440395355, + "step": 3365 + }, + { + "epoch": 2.6959999999999997, + "grad_norm": 5.347217671653992, + "learning_rate": 1.5477346284948292e-07, + "logits/chosen": -1.1148836612701416, + "logits/rejected": -1.0766358375549316, + "logps/chosen": -0.2054683268070221, + "logps/rejected": -1.9612598419189453, + "loss": 0.352, + "odds_ratio_loss": 0.04080657660961151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010273417457938194, + "rewards/margins": 0.08778958767652512, + "rewards/rejected": -0.09806300699710846, + "sft_loss": 0.2054683268070221, + "step": 3370 + }, + { + "epoch": 2.7, + "grad_norm": 7.112953927848122, + "learning_rate": 1.507684480352292e-07, + "logits/chosen": -0.86543208360672, + "logits/rejected": -1.3143550157546997, + "logps/chosen": -0.7909175157546997, + "logps/rejected": -1.7691437005996704, + "loss": 0.4695, + "odds_ratio_loss": 0.23235614597797394, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03954587131738663, + "rewards/margins": 0.04891129583120346, + "rewards/rejected": -0.08845716714859009, + "sft_loss": 0.7909175157546997, + "step": 3375 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 7.9988782226086155, + "learning_rate": 1.4681432143872133e-07, + "logits/chosen": -0.6214197874069214, + "logits/rejected": -1.0236066579818726, + "logps/chosen": -0.5888375639915466, + "logps/rejected": -1.7203214168548584, + "loss": 0.4666, + "odds_ratio_loss": 0.1753186732530594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02944187819957733, + "rewards/margins": 0.056574203073978424, + "rewards/rejected": -0.08601607382297516, + "sft_loss": 0.5888375639915466, + "step": 3380 + }, + { + "epoch": 2.708, + "grad_norm": 3.7336969896252983, + "learning_rate": 1.4291116871284205e-07, + "logits/chosen": -1.00608229637146, + "logits/rejected": -1.003975510597229, + "logps/chosen": -0.43859434127807617, + "logps/rejected": -1.8750841617584229, + "loss": 0.4176, + "odds_ratio_loss": 0.21985983848571777, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02192971669137478, + "rewards/margins": 0.07182449102401733, + "rewards/rejected": -0.09375420212745667, + "sft_loss": 0.43859434127807617, + "step": 3385 + }, + { + "epoch": 2.7119999999999997, + "grad_norm": 9.218065040509618, + "learning_rate": 1.3905907440629752e-07, + "logits/chosen": -0.9918481111526489, + "logits/rejected": -1.3855509757995605, + "logps/chosen": -0.3834057152271271, + "logps/rejected": -1.7054436206817627, + "loss": 0.4777, + "odds_ratio_loss": 0.16741231083869934, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019170286133885384, + "rewards/margins": 0.06610190868377686, + "rewards/rejected": -0.08527218550443649, + "sft_loss": 0.3834057152271271, + "step": 3390 + }, + { + "epoch": 2.716, + "grad_norm": 4.274440200102076, + "learning_rate": 1.352581219617824e-07, + "logits/chosen": -0.6903150677680969, + "logits/rejected": -1.1200445890426636, + "logps/chosen": -0.3996601402759552, + "logps/rejected": -1.8368583917617798, + "loss": 0.4716, + "odds_ratio_loss": 0.13882999122142792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01998300477862358, + "rewards/margins": 0.07185991108417511, + "rewards/rejected": -0.09184291958808899, + "sft_loss": 0.3996601402759552, + "step": 3395 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 4.630834348255805, + "learning_rate": 1.31508393714177e-07, + "logits/chosen": -1.1776244640350342, + "logits/rejected": -0.7361685037612915, + "logps/chosen": -0.2189209908246994, + "logps/rejected": -1.2366139888763428, + "loss": 0.3637, + "odds_ratio_loss": 0.09817551076412201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010946051217615604, + "rewards/margins": 0.050884656608104706, + "rewards/rejected": -0.061830706894397736, + "sft_loss": 0.2189209908246994, + "step": 3400 + }, + { + "epoch": 2.724, + "grad_norm": 5.857849838216993, + "learning_rate": 1.278099708887587e-07, + "logits/chosen": -0.8234980702400208, + "logits/rejected": -0.843535304069519, + "logps/chosen": -0.36947911977767944, + "logps/rejected": -1.4612740278244019, + "loss": 0.475, + "odds_ratio_loss": 0.1974354237318039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018473956733942032, + "rewards/margins": 0.054589755833148956, + "rewards/rejected": -0.07306370884180069, + "sft_loss": 0.36947911977767944, + "step": 3405 + }, + { + "epoch": 2.7279999999999998, + "grad_norm": 6.068229298253824, + "learning_rate": 1.241629335994471e-07, + "logits/chosen": -0.7435353994369507, + "logits/rejected": -1.0612701177597046, + "logps/chosen": -0.5406766533851624, + "logps/rejected": -1.4909696578979492, + "loss": 0.4116, + "odds_ratio_loss": 0.30309200286865234, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.027033831924200058, + "rewards/margins": 0.04751463979482651, + "rewards/rejected": -0.07454848289489746, + "sft_loss": 0.5406766533851624, + "step": 3410 + }, + { + "epoch": 2.732, + "grad_norm": 8.977086173159117, + "learning_rate": 1.2056736084706588e-07, + "logits/chosen": -0.9622576832771301, + "logits/rejected": -1.2189757823944092, + "logps/chosen": -0.43772053718566895, + "logps/rejected": -1.773707628250122, + "loss": 0.4551, + "odds_ratio_loss": 0.18512067198753357, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.021886030212044716, + "rewards/margins": 0.0667993575334549, + "rewards/rejected": -0.08868537843227386, + "sft_loss": 0.43772053718566895, + "step": 3415 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 5.513247634423349, + "learning_rate": 1.1702333051763271e-07, + "logits/chosen": -0.6844684481620789, + "logits/rejected": -1.1717528104782104, + "logps/chosen": -0.5392543077468872, + "logps/rejected": -1.2498254776000977, + "loss": 0.4714, + "odds_ratio_loss": 0.30782368779182434, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02696271799504757, + "rewards/margins": 0.03552855923771858, + "rewards/rejected": -0.062491275370121, + "sft_loss": 0.5392543077468872, + "step": 3420 + }, + { + "epoch": 2.74, + "grad_norm": 3.9149151857473714, + "learning_rate": 1.1353091938067024e-07, + "logits/chosen": -0.6818164587020874, + "logits/rejected": -0.7006502151489258, + "logps/chosen": -0.4631117880344391, + "logps/rejected": -1.6541814804077148, + "loss": 0.434, + "odds_ratio_loss": 0.186067134141922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023155588656663895, + "rewards/margins": 0.059553492814302444, + "rewards/rejected": -0.08270907402038574, + "sft_loss": 0.4631117880344391, + "step": 3425 + }, + { + "epoch": 2.7439999999999998, + "grad_norm": 5.1831375438532135, + "learning_rate": 1.1009020308754587e-07, + "logits/chosen": -0.7273832559585571, + "logits/rejected": -0.6493693590164185, + "logps/chosen": -0.17281028628349304, + "logps/rejected": -1.4289897680282593, + "loss": 0.4975, + "odds_ratio_loss": 0.1179138645529747, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008640513755381107, + "rewards/margins": 0.06280897557735443, + "rewards/rejected": -0.07144948095083237, + "sft_loss": 0.17281028628349304, + "step": 3430 + }, + { + "epoch": 2.748, + "grad_norm": 4.973491478841313, + "learning_rate": 1.067012561698319e-07, + "logits/chosen": -1.207350730895996, + "logits/rejected": -0.8244549632072449, + "logps/chosen": -0.5802727937698364, + "logps/rejected": -1.568420648574829, + "loss": 0.4946, + "odds_ratio_loss": 0.22998332977294922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029013637453317642, + "rewards/margins": 0.04940740019083023, + "rewards/rejected": -0.07842103391885757, + "sft_loss": 0.5802727937698364, + "step": 3435 + }, + { + "epoch": 2.752, + "grad_norm": 7.538216183918059, + "learning_rate": 1.0336415203768962e-07, + "logits/chosen": -0.7276453971862793, + "logits/rejected": -1.1800649166107178, + "logps/chosen": -0.268561452627182, + "logps/rejected": -1.8346469402313232, + "loss": 0.4621, + "odds_ratio_loss": 0.10011246055364609, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013428074307739735, + "rewards/margins": 0.07830427587032318, + "rewards/rejected": -0.09173235297203064, + "sft_loss": 0.268561452627182, + "step": 3440 + }, + { + "epoch": 2.7560000000000002, + "grad_norm": 4.385995959035907, + "learning_rate": 1.0007896297828113e-07, + "logits/chosen": -0.9866234660148621, + "logits/rejected": -0.8177453875541687, + "logps/chosen": -0.2839185297489166, + "logps/rejected": -1.2978515625, + "loss": 0.4541, + "odds_ratio_loss": 0.13113179802894592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014195924624800682, + "rewards/margins": 0.05069665238261223, + "rewards/rejected": -0.06489257514476776, + "sft_loss": 0.2839185297489166, + "step": 3445 + }, + { + "epoch": 2.76, + "grad_norm": 5.040991958957209, + "learning_rate": 9.684576015420277e-08, + "logits/chosen": -1.0320391654968262, + "logits/rejected": -0.9190491437911987, + "logps/chosen": -0.49723321199417114, + "logps/rejected": -2.3937244415283203, + "loss": 0.4407, + "odds_ratio_loss": 0.21446876227855682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024861661717295647, + "rewards/margins": 0.09482455253601074, + "rewards/rejected": -0.11968620866537094, + "sft_loss": 0.49723321199417114, + "step": 3450 + }, + { + "epoch": 2.7640000000000002, + "grad_norm": 6.834196455601754, + "learning_rate": 9.36646136019434e-08, + "logits/chosen": -1.0055029392242432, + "logits/rejected": -0.663110077381134, + "logps/chosen": -0.2397165298461914, + "logps/rejected": -1.2084038257598877, + "loss": 0.4345, + "odds_ratio_loss": 0.11389895528554916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011985826306045055, + "rewards/margins": 0.04843435436487198, + "rewards/rejected": -0.06042018532752991, + "sft_loss": 0.2397165298461914, + "step": 3455 + }, + { + "epoch": 2.768, + "grad_norm": 6.622662942386174, + "learning_rate": 9.053559223036746e-08, + "logits/chosen": -0.5032299757003784, + "logits/rejected": -0.8415955305099487, + "logps/chosen": -0.7557247877120972, + "logps/rejected": -1.9935903549194336, + "loss": 0.4693, + "odds_ratio_loss": 0.29048803448677063, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03778623789548874, + "rewards/margins": 0.0618932731449604, + "rewards/rejected": -0.09967950731515884, + "sft_loss": 0.7557247877120972, + "step": 3460 + }, + { + "epoch": 2.7720000000000002, + "grad_norm": 5.779744351491692, + "learning_rate": 8.745876381922147e-08, + "logits/chosen": -0.9570730924606323, + "logits/rejected": -1.0343133211135864, + "logps/chosen": -0.3262855112552643, + "logps/rejected": -1.9049714803695679, + "loss": 0.4068, + "odds_ratio_loss": 0.1138528436422348, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016314277425408363, + "rewards/margins": 0.07893429696559906, + "rewards/rejected": -0.09524857252836227, + "sft_loss": 0.3262855112552643, + "step": 3465 + }, + { + "epoch": 2.776, + "grad_norm": 6.075626717992663, + "learning_rate": 8.44341950176683e-08, + "logits/chosen": -0.8139813542366028, + "logits/rejected": -1.261385202407837, + "logps/chosen": -0.5494598150253296, + "logps/rejected": -1.436342477798462, + "loss": 0.474, + "odds_ratio_loss": 0.2713177800178528, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02747299149632454, + "rewards/margins": 0.044344138354063034, + "rewards/rejected": -0.07181712985038757, + "sft_loss": 0.5494598150253296, + "step": 3470 + }, + { + "epoch": 2.7800000000000002, + "grad_norm": 6.027330370816976, + "learning_rate": 8.146195134284052e-08, + "logits/chosen": -1.251039981842041, + "logits/rejected": -1.2202528715133667, + "logps/chosen": -0.4107237458229065, + "logps/rejected": -1.591752052307129, + "loss": 0.4984, + "odds_ratio_loss": 0.1679602563381195, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.020536188036203384, + "rewards/margins": 0.05905142426490784, + "rewards/rejected": -0.07958760112524033, + "sft_loss": 0.4107237458229065, + "step": 3475 + }, + { + "epoch": 2.784, + "grad_norm": 5.9048587954628315, + "learning_rate": 7.854209717842231e-08, + "logits/chosen": -0.8869150280952454, + "logits/rejected": -1.0556492805480957, + "logps/chosen": -0.4128708243370056, + "logps/rejected": -1.244500756263733, + "loss": 0.4525, + "odds_ratio_loss": 0.25618261098861694, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02064354531466961, + "rewards/margins": 0.041581496596336365, + "rewards/rejected": -0.06222504377365112, + "sft_loss": 0.4128708243370056, + "step": 3480 + }, + { + "epoch": 2.7880000000000003, + "grad_norm": 5.609907193598404, + "learning_rate": 7.567469577325598e-08, + "logits/chosen": -1.0691337585449219, + "logits/rejected": -1.0011084079742432, + "logps/chosen": -0.2602657973766327, + "logps/rejected": -1.5600097179412842, + "loss": 0.4698, + "odds_ratio_loss": 0.10053672641515732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013013291172683239, + "rewards/margins": 0.0649871900677681, + "rewards/rejected": -0.07800048589706421, + "sft_loss": 0.2602657973766327, + "step": 3485 + }, + { + "epoch": 2.792, + "grad_norm": 6.288586916197363, + "learning_rate": 7.285980923996989e-08, + "logits/chosen": -0.814477801322937, + "logits/rejected": -1.0002424716949463, + "logps/chosen": -0.39793896675109863, + "logps/rejected": -1.5965421199798584, + "loss": 0.4963, + "odds_ratio_loss": 0.15374401211738586, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01989694871008396, + "rewards/margins": 0.05993015691637993, + "rewards/rejected": -0.07982710003852844, + "sft_loss": 0.39793896675109863, + "step": 3490 + }, + { + "epoch": 2.7960000000000003, + "grad_norm": 7.196669432259674, + "learning_rate": 7.009749855363457e-08, + "logits/chosen": -0.8080123066902161, + "logits/rejected": -0.828715980052948, + "logps/chosen": -0.4427156448364258, + "logps/rejected": -1.4885919094085693, + "loss": 0.467, + "odds_ratio_loss": 0.21041274070739746, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02213578298687935, + "rewards/margins": 0.0522938147187233, + "rewards/rejected": -0.07442959398031235, + "sft_loss": 0.4427156448364258, + "step": 3495 + }, + { + "epoch": 2.8, + "grad_norm": 6.610897880824554, + "learning_rate": 6.738782355044048e-08, + "logits/chosen": -0.7845413088798523, + "logits/rejected": -1.2522486448287964, + "logps/chosen": -0.19464164972305298, + "logps/rejected": -1.954445242881775, + "loss": 0.5121, + "odds_ratio_loss": 0.07869584858417511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009732084348797798, + "rewards/margins": 0.0879901796579361, + "rewards/rejected": -0.09772225469350815, + "sft_loss": 0.19464164972305298, + "step": 3500 + }, + { + "epoch": 2.8040000000000003, + "grad_norm": 5.994838172411329, + "learning_rate": 6.47308429264032e-08, + "logits/chosen": -0.6243001818656921, + "logits/rejected": -0.9205516576766968, + "logps/chosen": -0.45419034361839294, + "logps/rejected": -1.346792221069336, + "loss": 0.4879, + "odds_ratio_loss": 0.21579177677631378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022709516808390617, + "rewards/margins": 0.04463009163737297, + "rewards/rejected": -0.06733961403369904, + "sft_loss": 0.45419034361839294, + "step": 3505 + }, + { + "epoch": 2.808, + "grad_norm": 5.18964559816438, + "learning_rate": 6.212661423609184e-08, + "logits/chosen": -0.7687469720840454, + "logits/rejected": -0.9978445172309875, + "logps/chosen": -0.38948315382003784, + "logps/rejected": -1.690354585647583, + "loss": 0.4909, + "odds_ratio_loss": 0.1466546207666397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01947415992617607, + "rewards/margins": 0.06504356116056442, + "rewards/rejected": -0.08451772481203079, + "sft_loss": 0.38948315382003784, + "step": 3510 + }, + { + "epoch": 2.8120000000000003, + "grad_norm": 4.278585362133975, + "learning_rate": 5.957519389138106e-08, + "logits/chosen": -0.8727335929870605, + "logits/rejected": -0.9041558504104614, + "logps/chosen": -0.5910320281982422, + "logps/rejected": -1.4024748802185059, + "loss": 0.468, + "odds_ratio_loss": 0.2644711434841156, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02955160103738308, + "rewards/margins": 0.0405721440911293, + "rewards/rejected": -0.07012374699115753, + "sft_loss": 0.5910320281982422, + "step": 3515 + }, + { + "epoch": 2.816, + "grad_norm": 8.621278806090325, + "learning_rate": 5.707663716023021e-08, + "logits/chosen": -0.7765305042266846, + "logits/rejected": -1.2130156755447388, + "logps/chosen": -0.4113030433654785, + "logps/rejected": -1.911913275718689, + "loss": 0.4462, + "odds_ratio_loss": 0.16190369427204132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020565154030919075, + "rewards/margins": 0.07503052800893784, + "rewards/rejected": -0.09559567272663116, + "sft_loss": 0.4113030433654785, + "step": 3520 + }, + { + "epoch": 2.82, + "grad_norm": 5.895614935922102, + "learning_rate": 5.463099816548578e-08, + "logits/chosen": -0.8122976422309875, + "logits/rejected": -1.2246438264846802, + "logps/chosen": -0.6956444978713989, + "logps/rejected": -1.742977499961853, + "loss": 0.4596, + "odds_ratio_loss": 0.2761508822441101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03478222340345383, + "rewards/margins": 0.05236666277050972, + "rewards/rejected": -0.08714888989925385, + "sft_loss": 0.6956444978713989, + "step": 3525 + }, + { + "epoch": 2.824, + "grad_norm": 5.257809415391807, + "learning_rate": 5.22383298837098e-08, + "logits/chosen": -1.0394823551177979, + "logits/rejected": -1.370062232017517, + "logps/chosen": -0.6062598824501038, + "logps/rejected": -1.899864912033081, + "loss": 0.5637, + "odds_ratio_loss": 0.30006757378578186, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.030312994495034218, + "rewards/margins": 0.06468025594949722, + "rewards/rejected": -0.0949932411313057, + "sft_loss": 0.6062598824501038, + "step": 3530 + }, + { + "epoch": 2.828, + "grad_norm": 5.075614233899885, + "learning_rate": 4.989868414403048e-08, + "logits/chosen": -0.7696380615234375, + "logits/rejected": -0.9930199384689331, + "logps/chosen": -0.5346277952194214, + "logps/rejected": -2.1930575370788574, + "loss": 0.5216, + "odds_ratio_loss": 0.2663384974002838, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02673139050602913, + "rewards/margins": 0.08292149752378464, + "rewards/rejected": -0.10965289175510406, + "sft_loss": 0.5346277952194214, + "step": 3535 + }, + { + "epoch": 2.832, + "grad_norm": 4.417337663334965, + "learning_rate": 4.761211162702117e-08, + "logits/chosen": -1.0091744661331177, + "logits/rejected": -0.9187496900558472, + "logps/chosen": -0.3657459616661072, + "logps/rejected": -1.539928674697876, + "loss": 0.5018, + "odds_ratio_loss": 0.1328873336315155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018287301063537598, + "rewards/margins": 0.05870913341641426, + "rewards/rejected": -0.07699643075466156, + "sft_loss": 0.3657459616661072, + "step": 3540 + }, + { + "epoch": 2.836, + "grad_norm": 4.546971390558208, + "learning_rate": 4.537866186360207e-08, + "logits/chosen": -0.7591463327407837, + "logits/rejected": -0.7273364067077637, + "logps/chosen": -0.42804569005966187, + "logps/rejected": -1.2514833211898804, + "loss": 0.4635, + "odds_ratio_loss": 0.2270592749118805, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02140228822827339, + "rewards/margins": 0.04117188602685928, + "rewards/rejected": -0.06257417052984238, + "sft_loss": 0.42804569005966187, + "step": 3545 + }, + { + "epoch": 2.84, + "grad_norm": 8.001000670131893, + "learning_rate": 4.319838323396691e-08, + "logits/chosen": -1.2595059871673584, + "logits/rejected": -0.9697495698928833, + "logps/chosen": -0.4693234860897064, + "logps/rejected": -1.4664093255996704, + "loss": 0.5123, + "odds_ratio_loss": 0.27086126804351807, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.02346617542207241, + "rewards/margins": 0.04985428601503372, + "rewards/rejected": -0.07332046329975128, + "sft_loss": 0.4693234860897064, + "step": 3550 + }, + { + "epoch": 2.844, + "grad_norm": 15.587957013391982, + "learning_rate": 4.1071322966535487e-08, + "logits/chosen": -0.996113657951355, + "logits/rejected": -0.6731228828430176, + "logps/chosen": -0.2613915801048279, + "logps/rejected": -1.2313883304595947, + "loss": 0.432, + "odds_ratio_loss": 0.12515100836753845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013069577515125275, + "rewards/margins": 0.04849984496831894, + "rewards/rejected": -0.061569422483444214, + "sft_loss": 0.2613915801048279, + "step": 3555 + }, + { + "epoch": 2.848, + "grad_norm": 7.371630683017223, + "learning_rate": 3.8997527136930004e-08, + "logits/chosen": -0.9078682065010071, + "logits/rejected": -0.7578974962234497, + "logps/chosen": -0.3508548438549042, + "logps/rejected": -1.5415148735046387, + "loss": 0.4796, + "odds_ratio_loss": 0.122395358979702, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01754274033010006, + "rewards/margins": 0.059532999992370605, + "rewards/rejected": -0.07707574218511581, + "sft_loss": 0.3508548438549042, + "step": 3560 + }, + { + "epoch": 2.852, + "grad_norm": 8.0726795268241, + "learning_rate": 3.6977040666977546e-08, + "logits/chosen": -1.1534029245376587, + "logits/rejected": -1.2099335193634033, + "logps/chosen": -0.5775087475776672, + "logps/rejected": -1.39055597782135, + "loss": 0.4869, + "odds_ratio_loss": 0.2733613848686218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028875436633825302, + "rewards/margins": 0.04065236449241638, + "rewards/rejected": -0.06952779740095139, + "sft_loss": 0.5775087475776672, + "step": 3565 + }, + { + "epoch": 2.856, + "grad_norm": 11.24865948956023, + "learning_rate": 3.5009907323737826e-08, + "logits/chosen": -0.552277684211731, + "logits/rejected": -1.1681157350540161, + "logps/chosen": -0.6316530108451843, + "logps/rejected": -1.6660516262054443, + "loss": 0.5061, + "odds_ratio_loss": 0.2643664479255676, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03158264979720116, + "rewards/margins": 0.05171992629766464, + "rewards/rejected": -0.0833025798201561, + "sft_loss": 0.6316530108451843, + "step": 3570 + }, + { + "epoch": 2.86, + "grad_norm": 8.187980250866474, + "learning_rate": 3.309616971855195e-08, + "logits/chosen": -0.9434603452682495, + "logits/rejected": -1.0135250091552734, + "logps/chosen": -0.23478789627552032, + "logps/rejected": -2.0454790592193604, + "loss": 0.4638, + "odds_ratio_loss": 0.12218568474054337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011739394627511501, + "rewards/margins": 0.09053455293178558, + "rewards/rejected": -0.10227394104003906, + "sft_loss": 0.23478789627552032, + "step": 3575 + }, + { + "epoch": 2.864, + "grad_norm": 10.077811295456423, + "learning_rate": 3.1235869306123766e-08, + "logits/chosen": -1.0300368070602417, + "logits/rejected": -0.8315317034721375, + "logps/chosen": -0.5715829133987427, + "logps/rejected": -1.4090276956558228, + "loss": 0.4626, + "odds_ratio_loss": 0.3393372893333435, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.028579145669937134, + "rewards/margins": 0.04187224060297012, + "rewards/rejected": -0.07045139372348785, + "sft_loss": 0.5715829133987427, + "step": 3580 + }, + { + "epoch": 2.868, + "grad_norm": 6.609706143071862, + "learning_rate": 2.9429046383618042e-08, + "logits/chosen": -1.0845788717269897, + "logits/rejected": -1.40879487991333, + "logps/chosen": -0.467085063457489, + "logps/rejected": -1.8588714599609375, + "loss": 0.5266, + "odds_ratio_loss": 0.16859348118305206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02335425093770027, + "rewards/margins": 0.06958932429552078, + "rewards/rejected": -0.09294357895851135, + "sft_loss": 0.467085063457489, + "step": 3585 + }, + { + "epoch": 2.872, + "grad_norm": 4.974574218409692, + "learning_rate": 2.767574008979007e-08, + "logits/chosen": -0.8864003419876099, + "logits/rejected": -1.0464935302734375, + "logps/chosen": -0.5476582050323486, + "logps/rejected": -1.9629188776016235, + "loss": 0.4826, + "odds_ratio_loss": 0.19431143999099731, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02738291397690773, + "rewards/margins": 0.07076303660869598, + "rewards/rejected": -0.09814594686031342, + "sft_loss": 0.5476582050323486, + "step": 3590 + }, + { + "epoch": 2.876, + "grad_norm": 5.391207506724364, + "learning_rate": 2.59759884041369e-08, + "logits/chosen": -0.8431358337402344, + "logits/rejected": -0.9085556864738464, + "logps/chosen": -0.26856502890586853, + "logps/rejected": -1.4795863628387451, + "loss": 0.442, + "odds_ratio_loss": 0.1016441136598587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01342825312167406, + "rewards/margins": 0.060551077127456665, + "rewards/rejected": -0.07397932559251785, + "sft_loss": 0.26856502890586853, + "step": 3595 + }, + { + "epoch": 2.88, + "grad_norm": 4.308262440272032, + "learning_rate": 2.4329828146074096e-08, + "logits/chosen": -0.7681323289871216, + "logits/rejected": -0.8165963292121887, + "logps/chosen": -0.3159729242324829, + "logps/rejected": -1.6058391332626343, + "loss": 0.4946, + "odds_ratio_loss": 0.14674882590770721, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015798646956682205, + "rewards/margins": 0.06449331343173981, + "rewards/rejected": -0.08029197156429291, + "sft_loss": 0.3159729242324829, + "step": 3600 + }, + { + "epoch": 2.884, + "grad_norm": 4.854650051917074, + "learning_rate": 2.2737294974140013e-08, + "logits/chosen": -0.9245132207870483, + "logits/rejected": -0.9439831972122192, + "logps/chosen": -0.5168892741203308, + "logps/rejected": -1.4722706079483032, + "loss": 0.6122, + "odds_ratio_loss": 0.26270395517349243, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.02584446594119072, + "rewards/margins": 0.04776906222105026, + "rewards/rejected": -0.07361352443695068, + "sft_loss": 0.5168892741203308, + "step": 3605 + }, + { + "epoch": 2.888, + "grad_norm": 4.810264838940049, + "learning_rate": 2.1198423385220822e-08, + "logits/chosen": -0.5693929195404053, + "logits/rejected": -0.8137924075126648, + "logps/chosen": -0.5562976598739624, + "logps/rejected": -1.870003342628479, + "loss": 0.4924, + "odds_ratio_loss": 0.1856372058391571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02781488373875618, + "rewards/margins": 0.06568528711795807, + "rewards/rejected": -0.09350016713142395, + "sft_loss": 0.5562976598739624, + "step": 3610 + }, + { + "epoch": 2.892, + "grad_norm": 4.590085289404893, + "learning_rate": 1.9713246713805588e-08, + "logits/chosen": -0.7041809558868408, + "logits/rejected": -0.819964587688446, + "logps/chosen": -0.5815272331237793, + "logps/rejected": -1.5759763717651367, + "loss": 0.4648, + "odds_ratio_loss": 0.21994754672050476, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029076358303427696, + "rewards/margins": 0.04972246289253235, + "rewards/rejected": -0.0787988156080246, + "sft_loss": 0.5815272331237793, + "step": 3615 + }, + { + "epoch": 2.896, + "grad_norm": 5.1032023997222655, + "learning_rate": 1.82817971312621e-08, + "logits/chosen": -0.9272521138191223, + "logits/rejected": -0.969031810760498, + "logps/chosen": -0.3067367672920227, + "logps/rejected": -1.5883128643035889, + "loss": 0.466, + "odds_ratio_loss": 0.13033534586429596, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01533683855086565, + "rewards/margins": 0.06407880038022995, + "rewards/rejected": -0.07941563427448273, + "sft_loss": 0.3067367672920227, + "step": 3620 + }, + { + "epoch": 2.9, + "grad_norm": 5.133104020591015, + "learning_rate": 1.6904105645142443e-08, + "logits/chosen": -0.6860765218734741, + "logits/rejected": -1.004093885421753, + "logps/chosen": -0.313940167427063, + "logps/rejected": -1.984702706336975, + "loss": 0.4767, + "odds_ratio_loss": 0.05924931913614273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01569700986146927, + "rewards/margins": 0.08353812992572784, + "rewards/rejected": -0.09923513978719711, + "sft_loss": 0.313940167427063, + "step": 3625 + }, + { + "epoch": 2.904, + "grad_norm": 6.289524397383519, + "learning_rate": 1.5580202098509078e-08, + "logits/chosen": -0.6269059181213379, + "logits/rejected": -0.887751579284668, + "logps/chosen": -0.6534531116485596, + "logps/rejected": -1.3601219654083252, + "loss": 0.5362, + "odds_ratio_loss": 0.3122999668121338, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03267265111207962, + "rewards/margins": 0.03533344715833664, + "rewards/rejected": -0.06800609827041626, + "sft_loss": 0.6534531116485596, + "step": 3630 + }, + { + "epoch": 2.908, + "grad_norm": 5.321061246172241, + "learning_rate": 1.4310115169289263e-08, + "logits/chosen": -0.7741699814796448, + "logits/rejected": -0.8883988261222839, + "logps/chosen": -0.3918963074684143, + "logps/rejected": -1.3303663730621338, + "loss": 0.4475, + "odds_ratio_loss": 0.18605419993400574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019594816491007805, + "rewards/margins": 0.04692351073026657, + "rewards/rejected": -0.06651832163333893, + "sft_loss": 0.3918963074684143, + "step": 3635 + }, + { + "epoch": 2.912, + "grad_norm": 5.614615596322008, + "learning_rate": 1.3093872369654148e-08, + "logits/chosen": -0.813896656036377, + "logits/rejected": -0.9681650400161743, + "logps/chosen": -0.318438321352005, + "logps/rejected": -1.9196224212646484, + "loss": 0.5048, + "odds_ratio_loss": 0.14587683975696564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01592191681265831, + "rewards/margins": 0.08005920052528381, + "rewards/rejected": -0.09598111361265182, + "sft_loss": 0.318438321352005, + "step": 3640 + }, + { + "epoch": 2.916, + "grad_norm": 6.909146903510477, + "learning_rate": 1.193150004542204e-08, + "logits/chosen": -0.6660367250442505, + "logits/rejected": -1.0153348445892334, + "logps/chosen": -0.46080127358436584, + "logps/rejected": -1.8787386417388916, + "loss": 0.5142, + "odds_ratio_loss": 0.22567503154277802, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.023040063679218292, + "rewards/margins": 0.07089685648679733, + "rewards/rejected": -0.09393692761659622, + "sft_loss": 0.46080127358436584, + "step": 3645 + }, + { + "epoch": 2.92, + "grad_norm": 7.790549829630172, + "learning_rate": 1.0823023375489128e-08, + "logits/chosen": -0.817104697227478, + "logits/rejected": -0.8277327418327332, + "logps/chosen": -0.6163122057914734, + "logps/rejected": -1.7266231775283813, + "loss": 0.4548, + "odds_ratio_loss": 0.251958429813385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03081561252474785, + "rewards/margins": 0.05551555007696152, + "rewards/rejected": -0.08633115142583847, + "sft_loss": 0.6163122057914734, + "step": 3650 + }, + { + "epoch": 2.924, + "grad_norm": 4.529447800983838, + "learning_rate": 9.76846637128187e-09, + "logits/chosen": -1.1566288471221924, + "logits/rejected": -1.1448895931243896, + "logps/chosen": -0.6995830535888672, + "logps/rejected": -1.5597145557403564, + "loss": 0.4972, + "odds_ratio_loss": 0.30369362235069275, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03497915342450142, + "rewards/margins": 0.043006572872400284, + "rewards/rejected": -0.0779857262969017, + "sft_loss": 0.6995830535888672, + "step": 3655 + }, + { + "epoch": 2.928, + "grad_norm": 5.857785314987744, + "learning_rate": 8.767851876239075e-09, + "logits/chosen": -0.5766550898551941, + "logits/rejected": -0.9048866033554077, + "logps/chosen": -0.43968862295150757, + "logps/rejected": -1.336573839187622, + "loss": 0.4655, + "odds_ratio_loss": 0.20616094768047333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021984433755278587, + "rewards/margins": 0.04484425485134125, + "rewards/rejected": -0.06682869046926498, + "sft_loss": 0.43968862295150757, + "step": 3660 + }, + { + "epoch": 2.932, + "grad_norm": 5.159988160646394, + "learning_rate": 7.821201565316184e-09, + "logits/chosen": -1.0310778617858887, + "logits/rejected": -0.7640330791473389, + "logps/chosen": -0.3315303921699524, + "logps/rejected": -1.6290994882583618, + "loss": 0.4037, + "odds_ratio_loss": 0.10144094377756119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01657651923596859, + "rewards/margins": 0.064878448843956, + "rewards/rejected": -0.08145496994256973, + "sft_loss": 0.3315303921699524, + "step": 3665 + }, + { + "epoch": 2.936, + "grad_norm": 18.981926903930404, + "learning_rate": 6.9285359445145366e-09, + "logits/chosen": -0.8311254382133484, + "logits/rejected": -0.9923003911972046, + "logps/chosen": -0.5326222777366638, + "logps/rejected": -1.6575853824615479, + "loss": 0.4691, + "odds_ratio_loss": 0.2388612926006317, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02663111314177513, + "rewards/margins": 0.056248150765895844, + "rewards/rejected": -0.08287926018238068, + "sft_loss": 0.5326222777366638, + "step": 3670 + }, + { + "epoch": 2.94, + "grad_norm": 4.65539553385166, + "learning_rate": 6.089874350439507e-09, + "logits/chosen": -0.8828865885734558, + "logits/rejected": -1.2277076244354248, + "logps/chosen": -0.44972023367881775, + "logps/rejected": -1.8230899572372437, + "loss": 0.4531, + "odds_ratio_loss": 0.0991957038640976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022486012428998947, + "rewards/margins": 0.06866848468780518, + "rewards/rejected": -0.09115449339151382, + "sft_loss": 0.44972023367881775, + "step": 3675 + }, + { + "epoch": 2.944, + "grad_norm": 7.036020184151846, + "learning_rate": 5.305234949880001e-09, + "logits/chosen": -0.8388309478759766, + "logits/rejected": -0.784132719039917, + "logps/chosen": -0.5297240018844604, + "logps/rejected": -1.3611842393875122, + "loss": 0.4606, + "odds_ratio_loss": 0.23987647891044617, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02648620307445526, + "rewards/margins": 0.04157300293445587, + "rewards/rejected": -0.06805920600891113, + "sft_loss": 0.5297240018844604, + "step": 3680 + }, + { + "epoch": 2.948, + "grad_norm": 8.642237728292242, + "learning_rate": 4.57463473941544e-09, + "logits/chosen": -1.135202407836914, + "logits/rejected": -0.9565531611442566, + "logps/chosen": -0.48526114225387573, + "logps/rejected": -1.3105952739715576, + "loss": 0.5389, + "odds_ratio_loss": 0.2372247278690338, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024263057857751846, + "rewards/margins": 0.04126670956611633, + "rewards/rejected": -0.06552976369857788, + "sft_loss": 0.48526114225387573, + "step": 3685 + }, + { + "epoch": 2.952, + "grad_norm": 4.384021145091059, + "learning_rate": 3.8980895450474455e-09, + "logits/chosen": -0.7918862104415894, + "logits/rejected": -1.0930888652801514, + "logps/chosen": -0.47447633743286133, + "logps/rejected": -1.5464060306549072, + "loss": 0.4936, + "odds_ratio_loss": 0.19870418310165405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023723818361759186, + "rewards/margins": 0.053596485406160355, + "rewards/rejected": -0.07732030004262924, + "sft_loss": 0.47447633743286133, + "step": 3690 + }, + { + "epoch": 2.956, + "grad_norm": 5.606541344203649, + "learning_rate": 3.275614021857609e-09, + "logits/chosen": -1.0226024389266968, + "logits/rejected": -1.014661192893982, + "logps/chosen": -0.8057661056518555, + "logps/rejected": -1.672181487083435, + "loss": 0.5466, + "odds_ratio_loss": 0.33504363894462585, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.040288303047418594, + "rewards/margins": 0.04332076758146286, + "rewards/rejected": -0.08360908180475235, + "sft_loss": 0.8057661056518555, + "step": 3695 + }, + { + "epoch": 2.96, + "grad_norm": 6.6670138700200035, + "learning_rate": 2.7072216536885855e-09, + "logits/chosen": -0.6843828558921814, + "logits/rejected": -0.8663504719734192, + "logps/chosen": -0.4747343063354492, + "logps/rejected": -1.3466534614562988, + "loss": 0.3528, + "odds_ratio_loss": 0.2356620728969574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02373671717941761, + "rewards/margins": 0.04359595477581024, + "rewards/rejected": -0.0673326700925827, + "sft_loss": 0.4747343063354492, + "step": 3700 + }, + { + "epoch": 2.964, + "grad_norm": 7.141276758716209, + "learning_rate": 2.192924752854042e-09, + "logits/chosen": -0.7393054962158203, + "logits/rejected": -1.508094072341919, + "logps/chosen": -0.49731236696243286, + "logps/rejected": -1.8641046285629272, + "loss": 0.4864, + "odds_ratio_loss": 0.12721852958202362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02486562170088291, + "rewards/margins": 0.06833961606025696, + "rewards/rejected": -0.09320523589849472, + "sft_loss": 0.49731236696243286, + "step": 3705 + }, + { + "epoch": 2.968, + "grad_norm": 5.266798743387709, + "learning_rate": 1.7327344598702667e-09, + "logits/chosen": -1.0665310621261597, + "logits/rejected": -0.858269989490509, + "logps/chosen": -0.536304235458374, + "logps/rejected": -1.4411725997924805, + "loss": 0.4227, + "odds_ratio_loss": 0.328524649143219, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.026815209537744522, + "rewards/margins": 0.04524341970682144, + "rewards/rejected": -0.07205863296985626, + "sft_loss": 0.536304235458374, + "step": 3710 + }, + { + "epoch": 2.972, + "grad_norm": 6.750661652998483, + "learning_rate": 1.3266607432155243e-09, + "logits/chosen": -0.6614164113998413, + "logits/rejected": -0.7348114848136902, + "logps/chosen": -0.39280936121940613, + "logps/rejected": -1.426816463470459, + "loss": 0.3934, + "odds_ratio_loss": 0.1887359321117401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019640469923615456, + "rewards/margins": 0.05170035362243652, + "rewards/rejected": -0.07134082168340683, + "sft_loss": 0.39280936121940613, + "step": 3715 + }, + { + "epoch": 2.976, + "grad_norm": 6.207932687783427, + "learning_rate": 9.747123991141193e-10, + "logits/chosen": -0.7722820043563843, + "logits/rejected": -0.7826868295669556, + "logps/chosen": -0.45992159843444824, + "logps/rejected": -1.4466217756271362, + "loss": 0.5124, + "odds_ratio_loss": 0.19496043026447296, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02299608290195465, + "rewards/margins": 0.04933501034975052, + "rewards/rejected": -0.07233108580112457, + "sft_loss": 0.45992159843444824, + "step": 3720 + }, + { + "epoch": 2.98, + "grad_norm": 5.007278723928615, + "learning_rate": 6.768970513457151e-10, + "logits/chosen": -0.8529748916625977, + "logits/rejected": -1.0278675556182861, + "logps/chosen": -0.3574032783508301, + "logps/rejected": -1.736302137374878, + "loss": 0.4984, + "odds_ratio_loss": 0.11382756382226944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017870163545012474, + "rewards/margins": 0.06894494593143463, + "rewards/rejected": -0.08681510388851166, + "sft_loss": 0.3574032783508301, + "step": 3725 + }, + { + "epoch": 2.984, + "grad_norm": 5.762199382824231, + "learning_rate": 4.332211510807427e-10, + "logits/chosen": -0.7780022025108337, + "logits/rejected": -1.1421865224838257, + "logps/chosen": -0.7099230885505676, + "logps/rejected": -1.55299711227417, + "loss": 0.546, + "odds_ratio_loss": 0.3306809067726135, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03549615666270256, + "rewards/margins": 0.042153697460889816, + "rewards/rejected": -0.07764985412359238, + "sft_loss": 0.7099230885505676, + "step": 3730 + }, + { + "epoch": 2.988, + "grad_norm": 4.682934344503231, + "learning_rate": 2.43689976739403e-10, + "logits/chosen": -0.8084946870803833, + "logits/rejected": -1.319000482559204, + "logps/chosen": -0.4620262682437897, + "logps/rejected": -1.2795708179473877, + "loss": 0.4644, + "odds_ratio_loss": 0.23649486899375916, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.023101316764950752, + "rewards/margins": 0.04087722301483154, + "rewards/rejected": -0.06397853791713715, + "sft_loss": 0.4620262682437897, + "step": 3735 + }, + { + "epoch": 2.992, + "grad_norm": 6.157538833749046, + "learning_rate": 1.0830763387897902e-10, + "logits/chosen": -0.8642560839653015, + "logits/rejected": -1.1312994956970215, + "logps/chosen": -0.3675539791584015, + "logps/rejected": -1.7602100372314453, + "loss": 0.5132, + "odds_ratio_loss": 0.23475000262260437, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.018377700820565224, + "rewards/margins": 0.06963280588388443, + "rewards/rejected": -0.0880105048418045, + "sft_loss": 0.3675539791584015, + "step": 3740 + }, + { + "epoch": 2.996, + "grad_norm": 5.903707133419837, + "learning_rate": 2.7077055103075233e-11, + "logits/chosen": -0.7607343792915344, + "logits/rejected": -1.1049892902374268, + "logps/chosen": -0.606769323348999, + "logps/rejected": -1.8843456506729126, + "loss": 0.4721, + "odds_ratio_loss": 0.20161142945289612, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03033846616744995, + "rewards/margins": 0.06387881934642792, + "rewards/rejected": -0.09421730041503906, + "sft_loss": 0.606769323348999, + "step": 3745 + }, + { + "epoch": 3.0, + "grad_norm": 4.814195311056904, + "learning_rate": 0.0, + "logits/chosen": -1.1040897369384766, + "logits/rejected": -1.2379401922225952, + "logps/chosen": -0.4253109097480774, + "logps/rejected": -1.3982548713684082, + "loss": 0.4811, + "odds_ratio_loss": 0.22021734714508057, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.02126554399728775, + "rewards/margins": 0.0486472025513649, + "rewards/rejected": -0.06991274654865265, + "sft_loss": 0.4253109097480774, + "step": 3750 + }, + { + "epoch": 3.0, + "step": 3750, + "total_flos": 126401337753600.0, + "train_loss": 0.8602390615145366, + "train_runtime": 16444.8476, + "train_samples_per_second": 3.649, + "train_steps_per_second": 0.228 + } + ], + "logging_steps": 5, + "max_steps": 3750, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100.0, + "total_flos": 126401337753600.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}