{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 1.8281873727150824, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.06036572530865669, "logits/rejected": 0.15200476348400116, "logps/chosen": -1.7157948017120361, "logps/rejected": -1.889754056930542, "loss": 0.1875, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.7157948017120361, "rewards/margins": 0.17395934462547302, "rewards/rejected": -1.889754056930542, "sft_loss": 1.4684072732925415, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 1.3261214622264328, "learning_rate": 1.7825311942959e-08, "logits/chosen": 0.015721673145890236, "logits/rejected": 0.14082524180412292, "logps/chosen": -1.803401231765747, "logps/rejected": -1.8462854623794556, "loss": 0.1915, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.803401231765747, "rewards/margins": 0.042884208261966705, "rewards/rejected": -1.8462854623794556, "sft_loss": 1.5086901187896729, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 1.6869277012824744, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.0386616587638855, "logits/rejected": 0.061269234865903854, "logps/chosen": -1.6346614360809326, "logps/rejected": -1.7652347087860107, "loss": 0.209, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6346614360809326, "rewards/margins": 0.130573108792305, "rewards/rejected": -1.7652347087860107, "sft_loss": 1.5001634359359741, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 2.184541215763592, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.044674623757600784, "logits/rejected": 0.04231274500489235, "logps/chosen": -1.7240028381347656, "logps/rejected": -1.8060672283172607, "loss": 0.2066, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7240028381347656, "rewards/margins": 0.0820644274353981, "rewards/rejected": -1.8060672283172607, "sft_loss": 1.5000752210617065, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 2.2885509048614607, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.061998527497053146, "logits/rejected": 0.023830315098166466, "logps/chosen": -1.869637131690979, "logps/rejected": -1.7783664464950562, "loss": 0.2291, "rewards/accuracies": 0.375, "rewards/chosen": -1.869637131690979, "rewards/margins": -0.09127076715230942, "rewards/rejected": -1.7783664464950562, "sft_loss": 1.545493245124817, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 1.6851104456657149, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.0781058818101883, "logits/rejected": 0.018343383446335793, "logps/chosen": -1.9102665185928345, "logps/rejected": -1.833251714706421, "loss": 0.1924, "rewards/accuracies": 0.4375, "rewards/chosen": -1.9102665185928345, "rewards/margins": -0.07701461762189865, "rewards/rejected": -1.833251714706421, "sft_loss": 1.6474205255508423, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 2.1116446404871723, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.06157956272363663, "logits/rejected": 0.10128965228796005, "logps/chosen": -1.847612738609314, "logps/rejected": -1.9992172718048096, "loss": 0.2024, "rewards/accuracies": 0.5, "rewards/chosen": -1.847612738609314, "rewards/margins": 0.1516043245792389, "rewards/rejected": -1.9992172718048096, "sft_loss": 1.5620036125183105, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 1.606280602217098, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.05405878275632858, "logits/rejected": 0.2346314936876297, "logps/chosen": -1.8864805698394775, "logps/rejected": -1.7479203939437866, "loss": 0.2067, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.8864805698394775, "rewards/margins": -0.13856001198291779, "rewards/rejected": -1.7479203939437866, "sft_loss": 1.5202641487121582, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 2.0142313565096983, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.020334195345640182, "logits/rejected": 0.22058598697185516, "logps/chosen": -1.8413622379302979, "logps/rejected": -1.8763911724090576, "loss": 0.1998, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8413622379302979, "rewards/margins": 0.03502892702817917, "rewards/rejected": -1.8763911724090576, "sft_loss": 1.5375398397445679, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 2.4004776884800254, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.048713017255067825, "logits/rejected": 0.1055837869644165, "logps/chosen": -1.9055683612823486, "logps/rejected": -1.7839202880859375, "loss": 0.2037, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.9055683612823486, "rewards/margins": -0.12164795398712158, "rewards/rejected": -1.7839202880859375, "sft_loss": 1.585984468460083, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 1.9065344756371017, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.1268903762102127, "logits/rejected": 0.09180887043476105, "logps/chosen": -1.8452562093734741, "logps/rejected": -1.8789564371109009, "loss": 0.1967, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.8452562093734741, "rewards/margins": 0.03370007127523422, "rewards/rejected": -1.8789564371109009, "sft_loss": 1.5881474018096924, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 1.9726561828800409, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.09329613298177719, "logits/rejected": 0.1027790755033493, "logps/chosen": -1.8032081127166748, "logps/rejected": -1.9089362621307373, "loss": 0.1881, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.8032081127166748, "rewards/margins": 0.10572835057973862, "rewards/rejected": -1.9089362621307373, "sft_loss": 1.5484545230865479, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 1.4158996749388524, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.026859384030103683, "logits/rejected": 0.12450633198022842, "logps/chosen": -1.6532108783721924, "logps/rejected": -1.7873064279556274, "loss": 0.1998, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6532108783721924, "rewards/margins": 0.13409557938575745, "rewards/rejected": -1.7873064279556274, "sft_loss": 1.4820306301116943, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 2.7550451891140657, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.07876388728618622, "logits/rejected": 0.07389497011899948, "logps/chosen": -1.78818678855896, "logps/rejected": -1.8362791538238525, "loss": 0.2083, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -1.78818678855896, "rewards/margins": 0.04809259623289108, "rewards/rejected": -1.8362791538238525, "sft_loss": 1.6427921056747437, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 1.6257388034299363, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.05901254341006279, "logits/rejected": 0.12799496948719025, "logps/chosen": -1.81658935546875, "logps/rejected": -2.0846848487854004, "loss": 0.1872, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.81658935546875, "rewards/margins": 0.26809555292129517, "rewards/rejected": -2.0846848487854004, "sft_loss": 1.582075595855713, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 1.6945858724148233, "learning_rate": 1.42602495543672e-07, "logits/chosen": 0.014201399870216846, "logits/rejected": 0.12179889529943466, "logps/chosen": -1.7666809558868408, "logps/rejected": -1.7983417510986328, "loss": 0.2031, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.7666809558868408, "rewards/margins": 0.03166085481643677, "rewards/rejected": -1.7983417510986328, "sft_loss": 1.5483434200286865, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 1.4872099012995739, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.14098253846168518, "logits/rejected": 0.11290383338928223, "logps/chosen": -1.849591612815857, "logps/rejected": -2.038990020751953, "loss": 0.1918, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.849591612815857, "rewards/margins": 0.18939858675003052, "rewards/rejected": -2.038990020751953, "sft_loss": 1.5175174474716187, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 1.6072015338976098, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.07844671607017517, "logits/rejected": 0.04320339113473892, "logps/chosen": -1.8240934610366821, "logps/rejected": -1.8337669372558594, "loss": 0.2062, "rewards/accuracies": 0.46875, "rewards/chosen": -1.8240934610366821, "rewards/margins": 0.009673514403402805, "rewards/rejected": -1.8337669372558594, "sft_loss": 1.4806925058364868, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 1.1704935247873707, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.07325179874897003, "logits/rejected": 0.08035645633935928, "logps/chosen": -1.9085460901260376, "logps/rejected": -1.9914019107818604, "loss": 0.194, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9085460901260376, "rewards/margins": 0.08285579085350037, "rewards/rejected": -1.9914019107818604, "sft_loss": 1.562260627746582, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 1.485734092812147, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.051618821918964386, "logits/rejected": 0.014512482099235058, "logps/chosen": -1.7798315286636353, "logps/rejected": -1.8910728693008423, "loss": 0.1933, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7798315286636353, "rewards/margins": 0.11124144494533539, "rewards/rejected": -1.8910728693008423, "sft_loss": 1.5264718532562256, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 1.2246905118137938, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.06890567392110825, "logits/rejected": 0.09699970483779907, "logps/chosen": -1.75466787815094, "logps/rejected": -1.9174325466156006, "loss": 0.1861, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.75466787815094, "rewards/margins": 0.16276490688323975, "rewards/rejected": -1.9174325466156006, "sft_loss": 1.4836585521697998, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 1.3983137398048286, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.014802386984229088, "logits/rejected": 0.11312691867351532, "logps/chosen": -1.8402084112167358, "logps/rejected": -1.9002002477645874, "loss": 0.2021, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.8402084112167358, "rewards/margins": 0.059991706162691116, "rewards/rejected": -1.9002002477645874, "sft_loss": 1.541327714920044, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 1.542976037363866, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.044870324432849884, "logits/rejected": 0.259618878364563, "logps/chosen": -1.8326823711395264, "logps/rejected": -2.162672281265259, "loss": 0.1796, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8326823711395264, "rewards/margins": 0.32998955249786377, "rewards/rejected": -2.162672281265259, "sft_loss": 1.6590197086334229, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 1.121394950130821, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.05395837500691414, "logits/rejected": 0.12947091460227966, "logps/chosen": -1.975813627243042, "logps/rejected": -2.1206367015838623, "loss": 0.1781, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.975813627243042, "rewards/margins": 0.14482299983501434, "rewards/rejected": -2.1206367015838623, "sft_loss": 1.6822599172592163, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 1.738390891126455, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.05913634970784187, "logits/rejected": 0.077244333922863, "logps/chosen": -1.8747756481170654, "logps/rejected": -1.791603446006775, "loss": 0.211, "rewards/accuracies": 0.46875, "rewards/chosen": -1.8747756481170654, "rewards/margins": -0.0831720232963562, "rewards/rejected": -1.791603446006775, "sft_loss": 1.5983225107192993, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 1.2172011134571397, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.07068384438753128, "logits/rejected": 0.21463127434253693, "logps/chosen": -1.9429035186767578, "logps/rejected": -2.06502103805542, "loss": 0.1788, "rewards/accuracies": 0.46875, "rewards/chosen": -1.9429035186767578, "rewards/margins": 0.12211757898330688, "rewards/rejected": -2.06502103805542, "sft_loss": 1.6812289953231812, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 1.4439255764123966, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.017868880182504654, "logits/rejected": 0.10886181890964508, "logps/chosen": -2.0224013328552246, "logps/rejected": -1.997601866722107, "loss": 0.191, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.0224013328552246, "rewards/margins": -0.02479952946305275, "rewards/rejected": -1.997601866722107, "sft_loss": 1.6454761028289795, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 1.9215015654493501, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.02103680931031704, "logits/rejected": 0.1541782170534134, "logps/chosen": -1.9884147644042969, "logps/rejected": -2.2401504516601562, "loss": 0.1721, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.9884147644042969, "rewards/margins": 0.25173547863960266, "rewards/rejected": -2.2401504516601562, "sft_loss": 1.6845868825912476, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 1.2117294815525885, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.007556894328445196, "logits/rejected": 0.1537446826696396, "logps/chosen": -1.9553453922271729, "logps/rejected": -2.1021084785461426, "loss": 0.1804, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.9553453922271729, "rewards/margins": 0.14676345884799957, "rewards/rejected": -2.1021084785461426, "sft_loss": 1.6186004877090454, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 1.4324321514957685, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.04060187563300133, "logits/rejected": 0.13013625144958496, "logps/chosen": -1.9446437358856201, "logps/rejected": -1.9469516277313232, "loss": 0.1945, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.9446437358856201, "rewards/margins": 0.002307900693267584, "rewards/rejected": -1.9469516277313232, "sft_loss": 1.4851183891296387, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 1.410377603703526, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.052264392375946045, "logits/rejected": -0.0019641772378236055, "logps/chosen": -2.0606913566589355, "logps/rejected": -2.109433650970459, "loss": 0.1869, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.0606913566589355, "rewards/margins": 0.048742227256298065, "rewards/rejected": -2.109433650970459, "sft_loss": 1.6540358066558838, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 1.4434488798396297, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.1261916607618332, "logits/rejected": 0.024200741201639175, "logps/chosen": -2.2658863067626953, "logps/rejected": -2.241018533706665, "loss": 0.1891, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.2658863067626953, "rewards/margins": -0.024867888540029526, "rewards/rejected": -2.241018533706665, "sft_loss": 1.7461020946502686, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 1.339027685977903, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.02971971035003662, "logits/rejected": 0.15595880150794983, "logps/chosen": -2.002155065536499, "logps/rejected": -2.307286024093628, "loss": 0.1813, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.002155065536499, "rewards/margins": 0.3051307797431946, "rewards/rejected": -2.307286024093628, "sft_loss": 1.5998609066009521, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 1.4548486638159581, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.05048539489507675, "logits/rejected": 0.00913523230701685, "logps/chosen": -2.278296947479248, "logps/rejected": -2.2565479278564453, "loss": 0.1819, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -2.278296947479248, "rewards/margins": -0.021749010309576988, "rewards/rejected": -2.2565479278564453, "sft_loss": 1.7058801651000977, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 1.3337472174712208, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.07039393484592438, "logits/rejected": 0.07238060235977173, "logps/chosen": -2.1607205867767334, "logps/rejected": -2.224153995513916, "loss": 0.1957, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -2.1607205867767334, "rewards/margins": 0.06343330442905426, "rewards/rejected": -2.224153995513916, "sft_loss": 1.7343257665634155, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 1.0106930560920084, "learning_rate": 3.2085561497326203e-07, "logits/chosen": 0.08963946998119354, "logits/rejected": 0.09271235764026642, "logps/chosen": -2.2524635791778564, "logps/rejected": -2.2211055755615234, "loss": 0.1764, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.2524635791778564, "rewards/margins": -0.03135796636343002, "rewards/rejected": -2.2211055755615234, "sft_loss": 1.715368628501892, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 1.3427419510653615, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.09551338851451874, "logits/rejected": 0.0052245319820940495, "logps/chosen": -2.188047170639038, "logps/rejected": -2.277782917022705, "loss": 0.1871, "rewards/accuracies": 0.5, "rewards/chosen": -2.188047170639038, "rewards/margins": 0.08973531424999237, "rewards/rejected": -2.277782917022705, "sft_loss": 1.6850707530975342, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 1.301877363775769, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.01349000446498394, "logits/rejected": 0.12422996759414673, "logps/chosen": -2.699043035507202, "logps/rejected": -2.5563666820526123, "loss": 0.1669, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -2.699043035507202, "rewards/margins": -0.14267598092556, "rewards/rejected": -2.5563666820526123, "sft_loss": 1.9406452178955078, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 1.086372047902067, "learning_rate": 3.475935828877005e-07, "logits/chosen": 0.08391423523426056, "logits/rejected": 0.25889503955841064, "logps/chosen": -2.082772731781006, "logps/rejected": -2.147904872894287, "loss": 0.1832, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.082772731781006, "rewards/margins": 0.06513235718011856, "rewards/rejected": -2.147904872894287, "sft_loss": 1.587127685546875, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 1.0784548518531638, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.045540787279605865, "logits/rejected": 0.10638017952442169, "logps/chosen": -2.497420310974121, "logps/rejected": -2.2494847774505615, "loss": 0.1807, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.497420310974121, "rewards/margins": -0.24793526530265808, "rewards/rejected": -2.2494847774505615, "sft_loss": 1.8374172449111938, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 1.2841841345947111, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.010509648360311985, "logits/rejected": 0.15566954016685486, "logps/chosen": -2.733661413192749, "logps/rejected": -2.442354202270508, "loss": 0.167, "rewards/accuracies": 0.5, "rewards/chosen": -2.733661413192749, "rewards/margins": -0.29130715131759644, "rewards/rejected": -2.442354202270508, "sft_loss": 1.7773916721343994, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 1.4421977705539422, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.11572986841201782, "logits/rejected": 0.0990462675690651, "logps/chosen": -2.5400633811950684, "logps/rejected": -2.898448944091797, "loss": 0.1456, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.5400633811950684, "rewards/margins": 0.3583856523036957, "rewards/rejected": -2.898448944091797, "sft_loss": 1.8469970226287842, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 1.506528371919276, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.11365026235580444, "logits/rejected": 0.1664947271347046, "logps/chosen": -2.3687429428100586, "logps/rejected": -2.4806742668151855, "loss": 0.1593, "rewards/accuracies": 0.5625, "rewards/chosen": -2.3687429428100586, "rewards/margins": 0.11193136870861053, "rewards/rejected": -2.4806742668151855, "sft_loss": 1.8512741327285767, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 1.4541076330685845, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.11326134204864502, "logits/rejected": 0.23379270732402802, "logps/chosen": -2.7988970279693604, "logps/rejected": -3.2029125690460205, "loss": 0.1425, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.7988970279693604, "rewards/margins": 0.40401554107666016, "rewards/rejected": -3.2029125690460205, "sft_loss": 2.0269691944122314, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 1.4193672051877066, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.07774774730205536, "logits/rejected": 0.11152136325836182, "logps/chosen": -2.676217555999756, "logps/rejected": -2.757519483566284, "loss": 0.1467, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.676217555999756, "rewards/margins": 0.08130187541246414, "rewards/rejected": -2.757519483566284, "sft_loss": 1.8078447580337524, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 1.4655988363625254, "learning_rate": 4.09982174688057e-07, "logits/chosen": 0.04009638726711273, "logits/rejected": 0.13166067004203796, "logps/chosen": -3.2687041759490967, "logps/rejected": -3.2889716625213623, "loss": 0.1471, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -3.2687041759490967, "rewards/margins": 0.020267415791749954, "rewards/rejected": -3.2889716625213623, "sft_loss": 1.96890127658844, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 1.278825876101477, "learning_rate": 4.188948306595365e-07, "logits/chosen": 0.03988439589738846, "logits/rejected": 0.2017788141965866, "logps/chosen": -3.112320899963379, "logps/rejected": -3.4062983989715576, "loss": 0.1284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.112320899963379, "rewards/margins": 0.29397743940353394, "rewards/rejected": -3.4062983989715576, "sft_loss": 1.9640719890594482, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 1.063516787032469, "learning_rate": 4.27807486631016e-07, "logits/chosen": 0.04343884810805321, "logits/rejected": 0.190804585814476, "logps/chosen": -3.3919873237609863, "logps/rejected": -3.4484734535217285, "loss": 0.1373, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -3.3919873237609863, "rewards/margins": 0.05648590251803398, "rewards/rejected": -3.4484734535217285, "sft_loss": 2.2613186836242676, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 1.246833107441079, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.08618511259555817, "logits/rejected": 0.22703281044960022, "logps/chosen": -3.113327980041504, "logps/rejected": -3.773731231689453, "loss": 0.1335, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.113327980041504, "rewards/margins": 0.6604034900665283, "rewards/rejected": -3.773731231689453, "sft_loss": 2.246939182281494, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 1.550906778055372, "learning_rate": 4.4563279857397503e-07, "logits/chosen": 0.008564489893615246, "logits/rejected": 0.19193480908870697, "logps/chosen": -4.5200276374816895, "logps/rejected": -4.392758369445801, "loss": 0.1367, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -4.5200276374816895, "rewards/margins": -0.12726902961730957, "rewards/rejected": -4.392758369445801, "sft_loss": 2.322331190109253, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 1.0017344608881895, "learning_rate": 4.545454545454545e-07, "logits/chosen": 0.027799557894468307, "logits/rejected": 0.1999567598104477, "logps/chosen": -3.2353675365448, "logps/rejected": -3.8548312187194824, "loss": 0.1278, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -3.2353675365448, "rewards/margins": 0.6194636821746826, "rewards/rejected": -3.8548312187194824, "sft_loss": 2.0332932472229004, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 1.1722811528835204, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.13097763061523438, "logits/rejected": -0.003967789001762867, "logps/chosen": -4.378108024597168, "logps/rejected": -3.8029494285583496, "loss": 0.1114, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.378108024597168, "rewards/margins": -0.5751584768295288, "rewards/rejected": -3.8029494285583496, "sft_loss": 2.4523839950561523, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 0.8598376845470599, "learning_rate": 4.723707664884135e-07, "logits/chosen": 0.04610385373234749, "logits/rejected": 0.14736303687095642, "logps/chosen": -5.376759052276611, "logps/rejected": -4.316792964935303, "loss": 0.1299, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -5.376759052276611, "rewards/margins": -1.0599651336669922, "rewards/rejected": -4.316792964935303, "sft_loss": 3.3849892616271973, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 0.659100840868797, "learning_rate": 4.81283422459893e-07, "logits/chosen": 0.05620427802205086, "logits/rejected": 0.22956471145153046, "logps/chosen": -4.417783737182617, "logps/rejected": -4.7367353439331055, "loss": 0.1148, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -4.417783737182617, "rewards/margins": 0.3189517557621002, "rewards/rejected": -4.7367353439331055, "sft_loss": 2.5751936435699463, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 0.944667145643773, "learning_rate": 4.901960784313725e-07, "logits/chosen": 0.1821393221616745, "logits/rejected": 0.29745563864707947, "logps/chosen": -5.001216888427734, "logps/rejected": -5.648111820220947, "loss": 0.1278, "rewards/accuracies": 0.59375, "rewards/chosen": -5.001216888427734, "rewards/margins": 0.646894633769989, "rewards/rejected": -5.648111820220947, "sft_loss": 3.029219150543213, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 0.62958069078015, "learning_rate": 4.99108734402852e-07, "logits/chosen": 0.06474583595991135, "logits/rejected": 0.26421093940734863, "logps/chosen": -5.785727500915527, "logps/rejected": -5.513718128204346, "loss": 0.111, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -5.785727500915527, "rewards/margins": -0.2720094919204712, "rewards/rejected": -5.513718128204346, "sft_loss": 3.7501449584960938, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 0.7707223965415297, "learning_rate": 5.080213903743315e-07, "logits/chosen": 0.07362186163663864, "logits/rejected": 0.24137923121452332, "logps/chosen": -5.315040111541748, "logps/rejected": -5.234798431396484, "loss": 0.1171, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -5.315040111541748, "rewards/margins": -0.08024124801158905, "rewards/rejected": -5.234798431396484, "sft_loss": 2.7596487998962402, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 0.43821944195019613, "learning_rate": 5.169340463458111e-07, "logits/chosen": 0.06858251988887787, "logits/rejected": 0.4367496967315674, "logps/chosen": -4.310262203216553, "logps/rejected": -5.49105167388916, "loss": 0.0853, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.310262203216553, "rewards/margins": 1.1807892322540283, "rewards/rejected": -5.49105167388916, "sft_loss": 2.8444995880126953, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 0.534101197488112, "learning_rate": 5.258467023172905e-07, "logits/chosen": 0.11962026357650757, "logits/rejected": 0.191305473446846, "logps/chosen": -5.814216613769531, "logps/rejected": -5.191082954406738, "loss": 0.1136, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -5.814216613769531, "rewards/margins": -0.6231337785720825, "rewards/rejected": -5.191082954406738, "sft_loss": 3.5382983684539795, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 0.7825104689335549, "learning_rate": 5.347593582887701e-07, "logits/chosen": 0.09059463441371918, "logits/rejected": 0.3025878965854645, "logps/chosen": -6.152596950531006, "logps/rejected": -6.278104782104492, "loss": 0.1085, "rewards/accuracies": 0.59375, "rewards/chosen": -6.152596950531006, "rewards/margins": 0.12550795078277588, "rewards/rejected": -6.278104782104492, "sft_loss": 3.1514294147491455, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 0.7203056918661761, "learning_rate": 5.436720142602496e-07, "logits/chosen": 0.15295490622520447, "logits/rejected": 0.23979385197162628, "logps/chosen": -5.263061046600342, "logps/rejected": -5.25927734375, "loss": 0.1103, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -5.263061046600342, "rewards/margins": -0.003783750580623746, "rewards/rejected": -5.25927734375, "sft_loss": 3.4355788230895996, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 0.48118015251670226, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.04459068924188614, "logits/rejected": 0.07622343301773071, "logps/chosen": -6.208052158355713, "logps/rejected": -6.269270420074463, "loss": 0.099, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -6.208052158355713, "rewards/margins": 0.061218809336423874, "rewards/rejected": -6.269270420074463, "sft_loss": 3.9879536628723145, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 0.5130287780473973, "learning_rate": 5.614973262032086e-07, "logits/chosen": 0.1535053700208664, "logits/rejected": 0.35032719373703003, "logps/chosen": -5.845208644866943, "logps/rejected": -5.75873327255249, "loss": 0.0997, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -5.845208644866943, "rewards/margins": -0.08647508919239044, "rewards/rejected": -5.75873327255249, "sft_loss": 3.6303811073303223, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 0.894923831994968, "learning_rate": 5.70409982174688e-07, "logits/chosen": 0.07708640396595001, "logits/rejected": 0.23186373710632324, "logps/chosen": -5.41533088684082, "logps/rejected": -5.087583065032959, "loss": 0.1109, "rewards/accuracies": 0.53125, "rewards/chosen": -5.41533088684082, "rewards/margins": -0.32774776220321655, "rewards/rejected": -5.087583065032959, "sft_loss": 3.4817776679992676, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 0.6602942214082753, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.001248963177204132, "logits/rejected": 0.1617216020822525, "logps/chosen": -6.740938663482666, "logps/rejected": -7.048771858215332, "loss": 0.0954, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -6.740938663482666, "rewards/margins": 0.30783215165138245, "rewards/rejected": -7.048771858215332, "sft_loss": 3.575293779373169, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 2.0172656785700736, "learning_rate": 5.88235294117647e-07, "logits/chosen": 0.08358623087406158, "logits/rejected": 0.2782168984413147, "logps/chosen": -4.833988666534424, "logps/rejected": -6.85193395614624, "loss": 0.0975, "rewards/accuracies": 0.59375, "rewards/chosen": -4.833988666534424, "rewards/margins": 2.0179455280303955, "rewards/rejected": -6.85193395614624, "sft_loss": 3.4718756675720215, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 1.559804678106283, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.021460741758346558, "logits/rejected": 0.18631306290626526, "logps/chosen": -7.669114589691162, "logps/rejected": -7.59591007232666, "loss": 0.0806, "rewards/accuracies": 0.53125, "rewards/chosen": -7.669114589691162, "rewards/margins": -0.07320408523082733, "rewards/rejected": -7.59591007232666, "sft_loss": 4.107626438140869, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 3.600551249598769, "learning_rate": 6.060606060606061e-07, "logits/chosen": 0.0006690695881843567, "logits/rejected": 0.20934703946113586, "logps/chosen": -7.419314384460449, "logps/rejected": -7.942813873291016, "loss": 0.0665, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -7.419314384460449, "rewards/margins": 0.5234988331794739, "rewards/rejected": -7.942813873291016, "sft_loss": 5.120023250579834, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 3.051133199140238, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.11144141852855682, "logits/rejected": 0.18817485868930817, "logps/chosen": -7.210999965667725, "logps/rejected": -7.412866115570068, "loss": 0.0692, "rewards/accuracies": 0.5, "rewards/chosen": -7.210999965667725, "rewards/margins": 0.20186543464660645, "rewards/rejected": -7.412866115570068, "sft_loss": 5.065989971160889, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 2.8496280707460535, "learning_rate": 6.238859180035651e-07, "logits/chosen": 0.15946859121322632, "logits/rejected": 0.3110240697860718, "logps/chosen": -7.066911220550537, "logps/rejected": -6.517140865325928, "loss": 0.0691, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -7.066911220550537, "rewards/margins": -0.5497702956199646, "rewards/rejected": -6.517140865325928, "sft_loss": 4.872412204742432, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 1.301936365963939, "learning_rate": 6.327985739750445e-07, "logits/chosen": 0.07622610032558441, "logits/rejected": 0.35027334094047546, "logps/chosen": -6.583775997161865, "logps/rejected": -6.518033027648926, "loss": 0.0641, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -6.583775997161865, "rewards/margins": -0.06574312597513199, "rewards/rejected": -6.518033027648926, "sft_loss": 5.07278299331665, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 2.4767014699621797, "learning_rate": 6.417112299465241e-07, "logits/chosen": 0.1512000560760498, "logits/rejected": 0.24605056643486023, "logps/chosen": -5.883225440979004, "logps/rejected": -6.043740272521973, "loss": 0.0636, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -5.883225440979004, "rewards/margins": 0.1605152040719986, "rewards/rejected": -6.043740272521973, "sft_loss": 4.487223148345947, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 2.5470960757144923, "learning_rate": 6.506238859180035e-07, "logits/chosen": 0.1374529004096985, "logits/rejected": 0.23795314133167267, "logps/chosen": -5.908981800079346, "logps/rejected": -5.590696811676025, "loss": 0.0649, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -5.908981800079346, "rewards/margins": -0.3182848393917084, "rewards/rejected": -5.590696811676025, "sft_loss": 4.756215572357178, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 1.213454043776727, "learning_rate": 6.59536541889483e-07, "logits/chosen": 0.21467037498950958, "logits/rejected": 0.3665415644645691, "logps/chosen": -5.693985462188721, "logps/rejected": -5.656804084777832, "loss": 0.07, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -5.693985462188721, "rewards/margins": -0.037180982530117035, "rewards/rejected": -5.656804084777832, "sft_loss": 4.857564926147461, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 1.4920484603725634, "learning_rate": 6.684491978609626e-07, "logits/chosen": 0.01740451343357563, "logits/rejected": 0.19893547892570496, "logps/chosen": -5.846138954162598, "logps/rejected": -6.645530700683594, "loss": 0.0605, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -5.846138954162598, "rewards/margins": 0.7993919253349304, "rewards/rejected": -6.645530700683594, "sft_loss": 4.869112968444824, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 5.257896059031984, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.005385799799114466, "logits/rejected": 0.09482140839099884, "logps/chosen": -5.7430901527404785, "logps/rejected": -6.084932804107666, "loss": 0.0593, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -5.7430901527404785, "rewards/margins": 0.3418427109718323, "rewards/rejected": -6.084932804107666, "sft_loss": 4.753641128540039, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 1.2399413549639557, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.023536410182714462, "logits/rejected": 0.0942329689860344, "logps/chosen": -4.997963905334473, "logps/rejected": -5.332228183746338, "loss": 0.0568, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.997963905334473, "rewards/margins": 0.33426347374916077, "rewards/rejected": -5.332228183746338, "sft_loss": 4.493335723876953, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 1.1537665725561919, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.06892560422420502, "logits/rejected": 0.3140993118286133, "logps/chosen": -5.147521018981934, "logps/rejected": -5.322199821472168, "loss": 0.061, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -5.147521018981934, "rewards/margins": 0.17467857897281647, "rewards/rejected": -5.322199821472168, "sft_loss": 4.530567646026611, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 1.9743142113650651, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.0006369724869728088, "logits/rejected": 0.22242622077465057, "logps/chosen": -5.135289192199707, "logps/rejected": -5.416306972503662, "loss": 0.061, "rewards/accuracies": 0.59375, "rewards/chosen": -5.135289192199707, "rewards/margins": 0.2810174524784088, "rewards/rejected": -5.416306972503662, "sft_loss": 4.482254981994629, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 12.527675546846213, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.0736534595489502, "logits/rejected": 0.21511869132518768, "logps/chosen": -5.2505717277526855, "logps/rejected": -5.617761135101318, "loss": 0.0563, "rewards/accuracies": 0.5625, "rewards/chosen": -5.2505717277526855, "rewards/margins": 0.3671889901161194, "rewards/rejected": -5.617761135101318, "sft_loss": 4.406624794006348, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.5372059345245361, "eval_logits/rejected": 0.6512095928192139, "eval_logps/chosen": -5.745402812957764, "eval_logps/rejected": -6.024581432342529, "eval_loss": 0.05729706957936287, "eval_rewards/accuracies": 0.5445103645324707, "eval_rewards/chosen": -5.745402812957764, "eval_rewards/margins": 0.27917909622192383, "eval_rewards/rejected": -6.024581432342529, "eval_runtime": 44.2063, "eval_samples_per_second": 30.426, "eval_sft_loss": 4.8351593017578125, "eval_steps_per_second": 7.623, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 3.482450373185867, "learning_rate": 7.219251336898395e-07, "logits/chosen": 0.12481657415628433, "logits/rejected": 0.24873094260692596, "logps/chosen": -5.8155927658081055, "logps/rejected": -6.231393814086914, "loss": 0.0616, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -5.8155927658081055, "rewards/margins": 0.41580113768577576, "rewards/rejected": -6.231393814086914, "sft_loss": 5.256585121154785, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 1.7016472746253801, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.06780445575714111, "logits/rejected": 0.23064598441123962, "logps/chosen": -5.035314559936523, "logps/rejected": -5.458802700042725, "loss": 0.0577, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -5.035314559936523, "rewards/margins": 0.4234878420829773, "rewards/rejected": -5.458802700042725, "sft_loss": 4.297842502593994, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 2.6585148249522117, "learning_rate": 7.397504456327985e-07, "logits/chosen": -0.00957435928285122, "logits/rejected": 0.0749988928437233, "logps/chosen": -5.311757564544678, "logps/rejected": -5.30417537689209, "loss": 0.0591, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -5.311757564544678, "rewards/margins": -0.007582643534988165, "rewards/rejected": -5.30417537689209, "sft_loss": 4.704192161560059, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 0.9301731458001584, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.13617399334907532, "logits/rejected": 0.18578466773033142, "logps/chosen": -4.937038898468018, "logps/rejected": -5.141440391540527, "loss": 0.0566, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.937038898468018, "rewards/margins": 0.2044021338224411, "rewards/rejected": -5.141440391540527, "sft_loss": 4.429440021514893, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 3.770695836704577, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.023957863450050354, "logits/rejected": 0.19839075207710266, "logps/chosen": -5.234984397888184, "logps/rejected": -5.377315521240234, "loss": 0.056, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -5.234984397888184, "rewards/margins": 0.14233139157295227, "rewards/rejected": -5.377315521240234, "sft_loss": 4.4770612716674805, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 3.156316119175797, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.19132229685783386, "logits/rejected": 0.07280747592449188, "logps/chosen": -5.19472599029541, "logps/rejected": -5.726254940032959, "loss": 0.0559, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -5.19472599029541, "rewards/margins": 0.5315293073654175, "rewards/rejected": -5.726254940032959, "sft_loss": 4.581809043884277, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 2.4555124360440823, "learning_rate": 7.754010695187165e-07, "logits/chosen": -0.2125493586063385, "logits/rejected": -0.1108977347612381, "logps/chosen": -5.088704586029053, "logps/rejected": -5.045218467712402, "loss": 0.056, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -5.088704586029053, "rewards/margins": -0.043486569076776505, "rewards/rejected": -5.045218467712402, "sft_loss": 4.416959285736084, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 2.1633149663163147, "learning_rate": 7.84313725490196e-07, "logits/chosen": -0.12068028748035431, "logits/rejected": 0.02915067970752716, "logps/chosen": -5.012939929962158, "logps/rejected": -5.2914557456970215, "loss": 0.0575, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -5.012939929962158, "rewards/margins": 0.2785159945487976, "rewards/rejected": -5.2914557456970215, "sft_loss": 4.793717384338379, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 2.234715768390998, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.013025635853409767, "logits/rejected": 0.12180577218532562, "logps/chosen": -4.833644866943359, "logps/rejected": -5.177438259124756, "loss": 0.0553, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.833644866943359, "rewards/margins": 0.34379321336746216, "rewards/rejected": -5.177438259124756, "sft_loss": 4.411333084106445, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 1.531338508012952, "learning_rate": 8.02139037433155e-07, "logits/chosen": 0.06976038217544556, "logits/rejected": 0.22492310404777527, "logps/chosen": -4.981315612792969, "logps/rejected": -5.218815326690674, "loss": 0.0552, "rewards/accuracies": 0.5625, "rewards/chosen": -4.981315612792969, "rewards/margins": 0.2375001460313797, "rewards/rejected": -5.218815326690674, "sft_loss": 4.548120021820068, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 1.4188376332791823, "learning_rate": 8.110516934046346e-07, "logits/chosen": 0.060088079422712326, "logits/rejected": 0.1696506291627884, "logps/chosen": -5.040231704711914, "logps/rejected": -5.448383331298828, "loss": 0.0552, "rewards/accuracies": 0.59375, "rewards/chosen": -5.040231704711914, "rewards/margins": 0.4081522524356842, "rewards/rejected": -5.448383331298828, "sft_loss": 4.615390300750732, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 0.8274857650787919, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.16978515684604645, "logits/rejected": 0.03128058463335037, "logps/chosen": -4.783569812774658, "logps/rejected": -4.997244358062744, "loss": 0.0578, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.783569812774658, "rewards/margins": 0.2136746346950531, "rewards/rejected": -4.997244358062744, "sft_loss": 4.518199443817139, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 1.1140822228366611, "learning_rate": 8.288770053475936e-07, "logits/chosen": -0.03868691250681877, "logits/rejected": 0.047852348536252975, "logps/chosen": -4.818562030792236, "logps/rejected": -5.142088413238525, "loss": 0.0569, "rewards/accuracies": 0.5625, "rewards/chosen": -4.818562030792236, "rewards/margins": 0.3235262334346771, "rewards/rejected": -5.142088413238525, "sft_loss": 4.523900508880615, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 1.0831999599710989, "learning_rate": 8.37789661319073e-07, "logits/chosen": -0.041154105216264725, "logits/rejected": -0.16716539859771729, "logps/chosen": -5.088252067565918, "logps/rejected": -5.1696577072143555, "loss": 0.0583, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -5.088252067565918, "rewards/margins": 0.08140526711940765, "rewards/rejected": -5.1696577072143555, "sft_loss": 4.869572639465332, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 1.0609505577965346, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.41082948446273804, "logits/rejected": -0.15013936161994934, "logps/chosen": -4.697094917297363, "logps/rejected": -5.290956974029541, "loss": 0.0541, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.697094917297363, "rewards/margins": 0.5938615202903748, "rewards/rejected": -5.290956974029541, "sft_loss": 4.516491889953613, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 1.0628021961229914, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.310746967792511, "logits/rejected": -0.06656353920698166, "logps/chosen": -4.562114715576172, "logps/rejected": -4.903138160705566, "loss": 0.0551, "rewards/accuracies": 0.625, "rewards/chosen": -4.562114715576172, "rewards/margins": 0.3410232663154602, "rewards/rejected": -4.903138160705566, "sft_loss": 4.2797112464904785, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 1.6375184623561623, "learning_rate": 8.645276292335115e-07, "logits/chosen": -0.07968901097774506, "logits/rejected": 0.021457534283399582, "logps/chosen": -4.943936824798584, "logps/rejected": -5.136715888977051, "loss": 0.0556, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.943936824798584, "rewards/margins": 0.192779541015625, "rewards/rejected": -5.136715888977051, "sft_loss": 4.519631385803223, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 2.9319822414945302, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.015240554697811604, "logits/rejected": 0.0742521733045578, "logps/chosen": -4.907310485839844, "logps/rejected": -5.221567153930664, "loss": 0.0555, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.907310485839844, "rewards/margins": 0.31425637006759644, "rewards/rejected": -5.221567153930664, "sft_loss": 4.523907661437988, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 1.0031833756740196, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.14841726422309875, "logits/rejected": -0.13759180903434753, "logps/chosen": -4.806763648986816, "logps/rejected": -4.975509166717529, "loss": 0.057, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -4.806763648986816, "rewards/margins": 0.1687450259923935, "rewards/rejected": -4.975509166717529, "sft_loss": 4.490440845489502, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 0.7041566026153324, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.28130513429641724, "logits/rejected": -0.15879981219768524, "logps/chosen": -4.831118583679199, "logps/rejected": -5.1757378578186035, "loss": 0.0545, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.831118583679199, "rewards/margins": 0.34461960196495056, "rewards/rejected": -5.1757378578186035, "sft_loss": 4.525076389312744, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 0.6823847318552707, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.36276552081108093, "logits/rejected": -0.19994986057281494, "logps/chosen": -4.660523891448975, "logps/rejected": -4.733887195587158, "loss": 0.0567, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.660523891448975, "rewards/margins": 0.07336314022541046, "rewards/rejected": -4.733887195587158, "sft_loss": 4.3608832359313965, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 1.1486157742033887, "learning_rate": 9.09090909090909e-07, "logits/chosen": -0.1318567544221878, "logits/rejected": -0.0561264343559742, "logps/chosen": -4.911768913269043, "logps/rejected": -5.159637451171875, "loss": 0.0556, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.911768913269043, "rewards/margins": 0.24786880612373352, "rewards/rejected": -5.159637451171875, "sft_loss": 4.5656256675720215, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 1.0872297295675921, "learning_rate": 9.180035650623885e-07, "logits/chosen": -0.1332496702671051, "logits/rejected": 0.011695345863699913, "logps/chosen": -4.879103183746338, "logps/rejected": -5.090020179748535, "loss": 0.0553, "rewards/accuracies": 0.59375, "rewards/chosen": -4.879103183746338, "rewards/margins": 0.21091759204864502, "rewards/rejected": -5.090020179748535, "sft_loss": 4.485103607177734, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 0.5462026811962317, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.168059304356575, "logits/rejected": 0.027810195460915565, "logps/chosen": -4.999017238616943, "logps/rejected": -5.254492282867432, "loss": 0.056, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.999017238616943, "rewards/margins": 0.255475252866745, "rewards/rejected": -5.254492282867432, "sft_loss": 4.662232398986816, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 0.8810552591658657, "learning_rate": 9.358288770053476e-07, "logits/chosen": -0.09420142322778702, "logits/rejected": 0.04737439751625061, "logps/chosen": -4.771590709686279, "logps/rejected": -5.034438133239746, "loss": 0.0554, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.771590709686279, "rewards/margins": 0.26284775137901306, "rewards/rejected": -5.034438133239746, "sft_loss": 4.294004917144775, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 0.6477536515618159, "learning_rate": 9.44741532976827e-07, "logits/chosen": -0.17829468846321106, "logits/rejected": -0.11911626160144806, "logps/chosen": -4.77829122543335, "logps/rejected": -5.030237197875977, "loss": 0.0557, "rewards/accuracies": 0.59375, "rewards/chosen": -4.77829122543335, "rewards/margins": 0.251945436000824, "rewards/rejected": -5.030237197875977, "sft_loss": 4.557742118835449, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 0.7898205513417741, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.3551715910434723, "logits/rejected": 0.03357185795903206, "logps/chosen": -4.911800384521484, "logps/rejected": -5.16738224029541, "loss": 0.0551, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.911800384521484, "rewards/margins": 0.25558188557624817, "rewards/rejected": -5.16738224029541, "sft_loss": 4.680028438568115, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 0.670936398200854, "learning_rate": 9.62566844919786e-07, "logits/chosen": -0.2937626242637634, "logits/rejected": -0.15557271242141724, "logps/chosen": -4.6293230056762695, "logps/rejected": -4.820884704589844, "loss": 0.057, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.6293230056762695, "rewards/margins": 0.1915610134601593, "rewards/rejected": -4.820884704589844, "sft_loss": 4.376486778259277, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 0.6738876951312873, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.3255738317966461, "logits/rejected": -0.04851512238383293, "logps/chosen": -5.104310035705566, "logps/rejected": -5.27868127822876, "loss": 0.055, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -5.104310035705566, "rewards/margins": 0.1743713766336441, "rewards/rejected": -5.27868127822876, "sft_loss": 4.744593620300293, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 2.1219096999806935, "learning_rate": 9.80392156862745e-07, "logits/chosen": -0.24361948668956757, "logits/rejected": -0.17571952939033508, "logps/chosen": -4.638392448425293, "logps/rejected": -4.891458988189697, "loss": 0.0551, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.638392448425293, "rewards/margins": 0.25306588411331177, "rewards/rejected": -4.891458988189697, "sft_loss": 4.369989395141602, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 0.8293066005958115, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.2626475989818573, "logits/rejected": -0.07505345344543457, "logps/chosen": -4.890946865081787, "logps/rejected": -5.142005443572998, "loss": 0.0558, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.890946865081787, "rewards/margins": 0.251058965921402, "rewards/rejected": -5.142005443572998, "sft_loss": 4.533967971801758, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 0.6161840677717771, "learning_rate": 9.98217468805704e-07, "logits/chosen": -0.17927469313144684, "logits/rejected": -0.12405016273260117, "logps/chosen": -4.814286231994629, "logps/rejected": -5.052570343017578, "loss": 0.0553, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.814286231994629, "rewards/margins": 0.2382839024066925, "rewards/rejected": -5.052570343017578, "sft_loss": 4.54550313949585, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 1.1352561464004647, "learning_rate": 9.999984476788462e-07, "logits/chosen": -0.26553431153297424, "logits/rejected": -0.14662417769432068, "logps/chosen": -4.51534366607666, "logps/rejected": -4.847154140472412, "loss": 0.0553, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.51534366607666, "rewards/margins": 0.3318101763725281, "rewards/rejected": -4.847154140472412, "sft_loss": 4.323355197906494, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 0.8650373367494478, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.2807005047798157, "logits/rejected": 0.030847817659378052, "logps/chosen": -4.749510765075684, "logps/rejected": -5.021355628967285, "loss": 0.0544, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.749510765075684, "rewards/margins": 0.2718445062637329, "rewards/rejected": -5.021355628967285, "sft_loss": 4.40215539932251, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 0.825097287674202, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.2592643201351166, "logits/rejected": -0.25759127736091614, "logps/chosen": -4.920784950256348, "logps/rejected": -5.223763465881348, "loss": 0.0561, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.920784950256348, "rewards/margins": 0.302978515625, "rewards/rejected": -5.223763465881348, "sft_loss": 4.647104263305664, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 0.6235577350217273, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.3262614607810974, "logits/rejected": -0.0489344522356987, "logps/chosen": -4.673770904541016, "logps/rejected": -5.098756790161133, "loss": 0.0551, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.673770904541016, "rewards/margins": 0.42498579621315, "rewards/rejected": -5.098756790161133, "sft_loss": 4.476217746734619, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 0.740239920335634, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.3703567683696747, "logits/rejected": -0.2420303076505661, "logps/chosen": -4.8338303565979, "logps/rejected": -5.058407783508301, "loss": 0.0545, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.8338303565979, "rewards/margins": 0.22457735240459442, "rewards/rejected": -5.058407783508301, "sft_loss": 4.397377014160156, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 0.7683188347707338, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.3597511649131775, "logits/rejected": -0.2020716369152069, "logps/chosen": -4.76621675491333, "logps/rejected": -4.9511847496032715, "loss": 0.0545, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.76621675491333, "rewards/margins": 0.1849685162305832, "rewards/rejected": -4.9511847496032715, "sft_loss": 4.394301891326904, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 0.6797907024001023, "learning_rate": 9.998878489314937e-07, "logits/chosen": -0.24955956637859344, "logits/rejected": -0.025243768468499184, "logps/chosen": -4.7654948234558105, "logps/rejected": -5.149549961090088, "loss": 0.055, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.7654948234558105, "rewards/margins": 0.384054571390152, "rewards/rejected": -5.149549961090088, "sft_loss": 4.515573024749756, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 1.6465081989407226, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.2952226996421814, "logits/rejected": -0.09677503257989883, "logps/chosen": -4.662662506103516, "logps/rejected": -4.912552833557129, "loss": 0.0548, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.662662506103516, "rewards/margins": 0.2498904913663864, "rewards/rejected": -4.912552833557129, "sft_loss": 4.407889366149902, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 1.2335372988895321, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.2944491505622864, "logits/rejected": -0.18662339448928833, "logps/chosen": -4.888004302978516, "logps/rejected": -5.157434940338135, "loss": 0.0557, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.888004302978516, "rewards/margins": 0.2694306969642639, "rewards/rejected": -5.157434940338135, "sft_loss": 4.625906467437744, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 1.3244626800898112, "learning_rate": 9.997670727736379e-07, "logits/chosen": -0.22974035143852234, "logits/rejected": 0.04643644392490387, "logps/chosen": -4.5863938331604, "logps/rejected": -4.8641676902771, "loss": 0.0552, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.5863938331604, "rewards/margins": 0.27777382731437683, "rewards/rejected": -4.8641676902771, "sft_loss": 4.253309726715088, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 0.5535449490055506, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.1554524004459381, "logits/rejected": -0.05251539871096611, "logps/chosen": -4.844357490539551, "logps/rejected": -5.277754783630371, "loss": 0.0542, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.844357490539551, "rewards/margins": 0.4333969056606293, "rewards/rejected": -5.277754783630371, "sft_loss": 4.583644866943359, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 1.856104039503372, "learning_rate": 9.996623109724173e-07, "logits/chosen": -0.08841142803430557, "logits/rejected": 0.03570820018649101, "logps/chosen": -4.618399620056152, "logps/rejected": -4.836398601531982, "loss": 0.0545, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.618399620056152, "rewards/margins": 0.21799834072589874, "rewards/rejected": -4.836398601531982, "sft_loss": 4.201367378234863, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 1.072425302622145, "learning_rate": 9.996026582170488e-07, "logits/chosen": -0.10054608434438705, "logits/rejected": 0.11284986883401871, "logps/chosen": -4.950314521789551, "logps/rejected": -5.514838218688965, "loss": 0.0538, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.950314521789551, "rewards/margins": 0.5645238161087036, "rewards/rejected": -5.514838218688965, "sft_loss": 4.627020835876465, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 0.5453150161564525, "learning_rate": 9.995381583144996e-07, "logits/chosen": -0.2218894064426422, "logits/rejected": -0.04821163788437843, "logps/chosen": -4.688741207122803, "logps/rejected": -5.064606189727783, "loss": 0.054, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.688741207122803, "rewards/margins": 0.3758644163608551, "rewards/rejected": -5.064606189727783, "sft_loss": 4.479228973388672, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 1.591931090937354, "learning_rate": 9.994688118905471e-07, "logits/chosen": -0.3113294839859009, "logits/rejected": 0.02164360322058201, "logps/chosen": -4.350485801696777, "logps/rejected": -4.76249361038208, "loss": 0.0541, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.350485801696777, "rewards/margins": 0.41200733184814453, "rewards/rejected": -4.76249361038208, "sft_loss": 4.126279354095459, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 0.7619272038790944, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.26695743203163147, "logits/rejected": 0.017921606078743935, "logps/chosen": -4.865670204162598, "logps/rejected": -5.221670627593994, "loss": 0.0548, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.865670204162598, "rewards/margins": 0.3560001254081726, "rewards/rejected": -5.221670627593994, "sft_loss": 4.485020637512207, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 1.1133191585439717, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.23877230286598206, "logits/rejected": -0.1729026436805725, "logps/chosen": -4.773536682128906, "logps/rejected": -5.113900661468506, "loss": 0.0545, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.773536682128906, "rewards/margins": 0.3403640389442444, "rewards/rejected": -5.113900661468506, "sft_loss": 4.5196027755737305, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 0.4838798037479677, "learning_rate": 9.992317004533313e-07, "logits/chosen": -0.180585116147995, "logits/rejected": -0.07650501281023026, "logps/chosen": -4.630809783935547, "logps/rejected": -5.039240837097168, "loss": 0.0547, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.630809783935547, "rewards/margins": 0.4084309935569763, "rewards/rejected": -5.039240837097168, "sft_loss": 4.399133682250977, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 0.7799457617182248, "learning_rate": 9.991429751418696e-07, "logits/chosen": -0.17215296626091003, "logits/rejected": -0.14876854419708252, "logps/chosen": -4.6841607093811035, "logps/rejected": -5.022492408752441, "loss": 0.055, "rewards/accuracies": 0.59375, "rewards/chosen": -4.6841607093811035, "rewards/margins": 0.33833178877830505, "rewards/rejected": -5.022492408752441, "sft_loss": 4.436097621917725, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 0.7840566134412431, "learning_rate": 9.99049407143074e-07, "logits/chosen": -0.31777292490005493, "logits/rejected": -0.12179327011108398, "logps/chosen": -4.860751628875732, "logps/rejected": -4.970133304595947, "loss": 0.0563, "rewards/accuracies": 0.59375, "rewards/chosen": -4.860751628875732, "rewards/margins": 0.10938136279582977, "rewards/rejected": -4.970133304595947, "sft_loss": 4.530073642730713, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 0.9380159419430601, "learning_rate": 9.989509973647416e-07, "logits/chosen": -0.3308666944503784, "logits/rejected": -0.13465926051139832, "logps/chosen": -4.901317596435547, "logps/rejected": -5.171170234680176, "loss": 0.0547, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.901317596435547, "rewards/margins": 0.2698523998260498, "rewards/rejected": -5.171170234680176, "sft_loss": 4.707381725311279, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 1.5810380912043656, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.39763838052749634, "logits/rejected": -0.135453462600708, "logps/chosen": -4.548798561096191, "logps/rejected": -4.874077320098877, "loss": 0.055, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.548798561096191, "rewards/margins": 0.32527926564216614, "rewards/rejected": -4.874077320098877, "sft_loss": 4.289718151092529, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 0.7655930328404614, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.384846031665802, "logits/rejected": -0.2601172924041748, "logps/chosen": -4.761540412902832, "logps/rejected": -5.05265998840332, "loss": 0.0553, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.761540412902832, "rewards/margins": 0.29111921787261963, "rewards/rejected": -5.05265998840332, "sft_loss": 4.518373012542725, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 0.935266037635124, "learning_rate": 9.986267271350631e-07, "logits/chosen": -0.34337860345840454, "logits/rejected": -0.14144375920295715, "logps/chosen": -4.823574542999268, "logps/rejected": -4.984274864196777, "loss": 0.0562, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.823574542999268, "rewards/margins": 0.16069956123828888, "rewards/rejected": -4.984274864196777, "sft_loss": 4.562578201293945, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 0.5615696193518576, "learning_rate": 9.985089602559123e-07, "logits/chosen": -0.37192243337631226, "logits/rejected": -0.1106470599770546, "logps/chosen": -4.682257652282715, "logps/rejected": -5.030256271362305, "loss": 0.0535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.682257652282715, "rewards/margins": 0.347998708486557, "rewards/rejected": -5.030256271362305, "sft_loss": 4.349669933319092, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 1.4194745294621536, "learning_rate": 9.983863568406428e-07, "logits/chosen": -0.2427678108215332, "logits/rejected": -0.1963764727115631, "logps/chosen": -4.648349761962891, "logps/rejected": -4.934103488922119, "loss": 0.0552, "rewards/accuracies": 0.5625, "rewards/chosen": -4.648349761962891, "rewards/margins": 0.2857532203197479, "rewards/rejected": -4.934103488922119, "sft_loss": 4.321009635925293, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 0.8866737162217125, "learning_rate": 9.982589180787532e-07, "logits/chosen": -0.24945712089538574, "logits/rejected": -0.12678943574428558, "logps/chosen": -4.920681476593018, "logps/rejected": -5.2684712409973145, "loss": 0.0548, "rewards/accuracies": 0.59375, "rewards/chosen": -4.920681476593018, "rewards/margins": 0.3477899730205536, "rewards/rejected": -5.2684712409973145, "sft_loss": 4.634685516357422, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 0.8355538491486008, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.42114168405532837, "logits/rejected": -0.21116065979003906, "logps/chosen": -4.6978044509887695, "logps/rejected": -5.096659183502197, "loss": 0.0538, "rewards/accuracies": 0.625, "rewards/chosen": -4.6978044509887695, "rewards/margins": 0.39885538816452026, "rewards/rejected": -5.096659183502197, "sft_loss": 4.44577169418335, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 1.2264999824699427, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.30539703369140625, "logits/rejected": -0.014449876733124256, "logps/chosen": -4.504247188568115, "logps/rejected": -4.868443489074707, "loss": 0.0542, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.504247188568115, "rewards/margins": 0.3641965389251709, "rewards/rejected": -4.868443489074707, "sft_loss": 4.2012739181518555, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 0.8131909730707197, "learning_rate": 9.9784760231197e-07, "logits/chosen": -0.19957628846168518, "logits/rejected": -0.03655420243740082, "logps/chosen": -4.724609375, "logps/rejected": -5.090517997741699, "loss": 0.0546, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.724609375, "rewards/margins": 0.36590883135795593, "rewards/rejected": -5.090517997741699, "sft_loss": 4.364399433135986, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 0.4792104571469811, "learning_rate": 9.97700834996658e-07, "logits/chosen": -0.282558798789978, "logits/rejected": -0.0352824404835701, "logps/chosen": -4.992825508117676, "logps/rejected": -5.230543613433838, "loss": 0.0549, "rewards/accuracies": 0.5625, "rewards/chosen": -4.992825508117676, "rewards/margins": 0.23771806061267853, "rewards/rejected": -5.230543613433838, "sft_loss": 4.564342021942139, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 0.8052355596151808, "learning_rate": 9.97549238985662e-07, "logits/chosen": -0.20845279097557068, "logits/rejected": 0.08761949837207794, "logps/chosen": -4.708680152893066, "logps/rejected": -5.044828414916992, "loss": 0.0537, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.708680152893066, "rewards/margins": 0.3361477851867676, "rewards/rejected": -5.044828414916992, "sft_loss": 4.384383678436279, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 1.0124337160016392, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.3460480570793152, "logits/rejected": -0.10860247910022736, "logps/chosen": -4.608756065368652, "logps/rejected": -4.939072132110596, "loss": 0.054, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.608756065368652, "rewards/margins": 0.33031561970710754, "rewards/rejected": -4.939072132110596, "sft_loss": 4.346649169921875, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 1.285300410149873, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.33846408128738403, "logits/rejected": -0.14779387414455414, "logps/chosen": -4.764631748199463, "logps/rejected": -5.01378059387207, "loss": 0.054, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.764631748199463, "rewards/margins": 0.24914869666099548, "rewards/rejected": -5.01378059387207, "sft_loss": 4.407692909240723, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 0.440695821147554, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.3593650460243225, "logits/rejected": -0.1944659948348999, "logps/chosen": -4.49599552154541, "logps/rejected": -4.8249711990356445, "loss": 0.0543, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.49599552154541, "rewards/margins": 0.3289756178855896, "rewards/rejected": -4.8249711990356445, "sft_loss": 4.26392936706543, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 1.0055567756075858, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.32162588834762573, "logits/rejected": -0.07832889258861542, "logps/chosen": -4.858660697937012, "logps/rejected": -5.215462684631348, "loss": 0.0528, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.858660697937012, "rewards/margins": 0.3568021357059479, "rewards/rejected": -5.215462684631348, "sft_loss": 4.455192565917969, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 1.1135407344664665, "learning_rate": 9.967188816117726e-07, "logits/chosen": -0.23036710917949677, "logits/rejected": -0.05882059410214424, "logps/chosen": -4.685460567474365, "logps/rejected": -5.1761980056762695, "loss": 0.0552, "rewards/accuracies": 0.625, "rewards/chosen": -4.685460567474365, "rewards/margins": 0.4907374382019043, "rewards/rejected": -5.1761980056762695, "sft_loss": 4.325514316558838, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 0.49216293494483937, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.24461698532104492, "logits/rejected": 0.05077634006738663, "logps/chosen": -4.6956305503845215, "logps/rejected": -5.1042890548706055, "loss": 0.054, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.6956305503845215, "rewards/margins": 0.40865880250930786, "rewards/rejected": -5.1042890548706055, "sft_loss": 4.272480010986328, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 0.7538062835686686, "learning_rate": 9.963529928746533e-07, "logits/chosen": -0.13682588934898376, "logits/rejected": 0.09025086462497711, "logps/chosen": -4.984869003295898, "logps/rejected": -5.451809406280518, "loss": 0.0538, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.984869003295898, "rewards/margins": 0.46694087982177734, "rewards/rejected": -5.451809406280518, "sft_loss": 4.631129264831543, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 0.8804951698404386, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.23067674040794373, "logits/rejected": -0.09110057353973389, "logps/chosen": -4.5784502029418945, "logps/rejected": -4.966479778289795, "loss": 0.0545, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.5784502029418945, "rewards/margins": 0.3880303204059601, "rewards/rejected": -4.966479778289795, "sft_loss": 4.248000144958496, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 0.9443455201928435, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.13407650589942932, "logits/rejected": -0.029496919363737106, "logps/chosen": -4.822766304016113, "logps/rejected": -5.181981086730957, "loss": 0.0547, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.822766304016113, "rewards/margins": 0.35921525955200195, "rewards/rejected": -5.181981086730957, "sft_loss": 4.538924217224121, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 0.615353021347065, "learning_rate": 9.957680473564493e-07, "logits/chosen": -0.10198304802179337, "logits/rejected": 0.09545397013425827, "logps/chosen": -4.82249641418457, "logps/rejected": -5.266402244567871, "loss": 0.0536, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.82249641418457, "rewards/margins": 0.44390565156936646, "rewards/rejected": -5.266402244567871, "sft_loss": 4.410745143890381, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 0.5160785518479237, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.21874277293682098, "logits/rejected": 0.046989113092422485, "logps/chosen": -4.701724052429199, "logps/rejected": -5.032949447631836, "loss": 0.0543, "rewards/accuracies": 0.59375, "rewards/chosen": -4.701724052429199, "rewards/margins": 0.33122485876083374, "rewards/rejected": -5.032949447631836, "sft_loss": 4.272125244140625, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 0.5850128791360416, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.38717955350875854, "logits/rejected": -0.12629468739032745, "logps/chosen": -4.6548614501953125, "logps/rejected": -4.998035907745361, "loss": 0.0537, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.6548614501953125, "rewards/margins": 0.34317439794540405, "rewards/rejected": -4.998035907745361, "sft_loss": 4.339907646179199, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 0.8364339689134905, "learning_rate": 9.951398126243133e-07, "logits/chosen": -0.2304932177066803, "logits/rejected": -0.08832928538322449, "logps/chosen": -4.8688859939575195, "logps/rejected": -5.244830131530762, "loss": 0.054, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.8688859939575195, "rewards/margins": 0.37594443559646606, "rewards/rejected": -5.244830131530762, "sft_loss": 4.49772310256958, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 0.5878980222924488, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.2880062460899353, "logits/rejected": -0.13873964548110962, "logps/chosen": -4.635272026062012, "logps/rejected": -5.012479782104492, "loss": 0.0529, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.635272026062012, "rewards/margins": 0.37720683217048645, "rewards/rejected": -5.012479782104492, "sft_loss": 4.257678985595703, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 0.44959914295393133, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.32320109009742737, "logits/rejected": -0.042316682636737823, "logps/chosen": -4.49575138092041, "logps/rejected": -5.073681354522705, "loss": 0.0531, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.49575138092041, "rewards/margins": 0.5779297947883606, "rewards/rejected": -5.073681354522705, "sft_loss": 4.208576202392578, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 0.7144637825313324, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.17591574788093567, "logits/rejected": -0.07191745191812515, "logps/chosen": -4.874433517456055, "logps/rejected": -5.258727073669434, "loss": 0.0533, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.874433517456055, "rewards/margins": 0.38429397344589233, "rewards/rejected": -5.258727073669434, "sft_loss": 4.46193265914917, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.164407417178154, "eval_logits/rejected": 0.2938738465309143, "eval_logps/chosen": -4.695379734039307, "eval_logps/rejected": -5.077708721160889, "eval_loss": 0.052394524216651917, "eval_rewards/accuracies": 0.6157270073890686, "eval_rewards/chosen": -4.695379734039307, "eval_rewards/margins": 0.3823291063308716, "eval_rewards/rejected": -5.077708721160889, "eval_runtime": 43.3284, "eval_samples_per_second": 31.042, "eval_sft_loss": 4.234010696411133, "eval_steps_per_second": 7.778, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 0.5270049225438456, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.3387565016746521, "logits/rejected": -0.06523511558771133, "logps/chosen": -4.50264835357666, "logps/rejected": -5.163118362426758, "loss": 0.0521, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.50264835357666, "rewards/margins": 0.6604706048965454, "rewards/rejected": -5.163118362426758, "sft_loss": 4.194108009338379, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 0.523313066003409, "learning_rate": 9.939967071845424e-07, "logits/chosen": -0.2286236733198166, "logits/rejected": -0.14836743474006653, "logps/chosen": -4.752197742462158, "logps/rejected": -5.03873348236084, "loss": 0.0558, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.752197742462158, "rewards/margins": 0.28653571009635925, "rewards/rejected": -5.03873348236084, "sft_loss": 4.4536638259887695, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 0.7802337043968409, "learning_rate": 9.937536987168413e-07, "logits/chosen": -0.20195576548576355, "logits/rejected": -0.011835225857794285, "logps/chosen": -4.779173851013184, "logps/rejected": -5.223197937011719, "loss": 0.053, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.779173851013184, "rewards/margins": 0.4440239369869232, "rewards/rejected": -5.223197937011719, "sft_loss": 4.434736728668213, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 0.8536421132761614, "learning_rate": 9.935058998485896e-07, "logits/chosen": -0.14910456538200378, "logits/rejected": -0.1316080391407013, "logps/chosen": -4.600480556488037, "logps/rejected": -5.0173115730285645, "loss": 0.0537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.600480556488037, "rewards/margins": 0.4168310761451721, "rewards/rejected": -5.0173115730285645, "sft_loss": 4.223259925842285, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 0.9865375408237489, "learning_rate": 9.932533129839333e-07, "logits/chosen": -0.3155757784843445, "logits/rejected": -0.13682445883750916, "logps/chosen": -4.623326301574707, "logps/rejected": -4.900997161865234, "loss": 0.0547, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.623326301574707, "rewards/margins": 0.27767083048820496, "rewards/rejected": -4.900997161865234, "sft_loss": 4.3923540115356445, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 0.7418103771522546, "learning_rate": 9.929959405734711e-07, "logits/chosen": -0.18393446505069733, "logits/rejected": 0.002429759595543146, "logps/chosen": -4.682926177978516, "logps/rejected": -5.0224385261535645, "loss": 0.0543, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.682926177978516, "rewards/margins": 0.33951207995414734, "rewards/rejected": -5.0224385261535645, "sft_loss": 4.493451118469238, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 0.6110152314789478, "learning_rate": 9.927337851142314e-07, "logits/chosen": -0.24163761734962463, "logits/rejected": -0.09300851076841354, "logps/chosen": -4.854322910308838, "logps/rejected": -5.115272045135498, "loss": 0.0547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.854322910308838, "rewards/margins": 0.2609490752220154, "rewards/rejected": -5.115272045135498, "sft_loss": 4.595727920532227, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 0.7110654327268647, "learning_rate": 9.924668491496474e-07, "logits/chosen": -0.332312673330307, "logits/rejected": -0.02082272246479988, "logps/chosen": -4.5603837966918945, "logps/rejected": -4.936267852783203, "loss": 0.0542, "rewards/accuracies": 0.59375, "rewards/chosen": -4.5603837966918945, "rewards/margins": 0.3758838176727295, "rewards/rejected": -4.936267852783203, "sft_loss": 4.269760608673096, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 0.6116882986091782, "learning_rate": 9.92195135269533e-07, "logits/chosen": -0.26116663217544556, "logits/rejected": -0.2063770592212677, "logps/chosen": -4.58666467666626, "logps/rejected": -4.9023871421813965, "loss": 0.0536, "rewards/accuracies": 0.625, "rewards/chosen": -4.58666467666626, "rewards/margins": 0.3157220482826233, "rewards/rejected": -4.9023871421813965, "sft_loss": 4.305259704589844, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 0.5190734803088864, "learning_rate": 9.919186461100574e-07, "logits/chosen": -0.3095361590385437, "logits/rejected": -0.1564372330904007, "logps/chosen": -4.769049167633057, "logps/rejected": -5.169442653656006, "loss": 0.0535, "rewards/accuracies": 0.65625, "rewards/chosen": -4.769049167633057, "rewards/margins": 0.40039342641830444, "rewards/rejected": -5.169442653656006, "sft_loss": 4.430898189544678, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 0.6090834044108917, "learning_rate": 9.9163738435372e-07, "logits/chosen": -0.2508835196495056, "logits/rejected": -0.04892207309603691, "logps/chosen": -4.640468597412109, "logps/rejected": -5.314169883728027, "loss": 0.0544, "rewards/accuracies": 0.59375, "rewards/chosen": -4.640468597412109, "rewards/margins": 0.6737015843391418, "rewards/rejected": -5.314169883728027, "sft_loss": 4.312269687652588, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 0.6570091722216572, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.3884831666946411, "logits/rejected": -0.15569785237312317, "logps/chosen": -4.644750118255615, "logps/rejected": -5.068819999694824, "loss": 0.0532, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.644750118255615, "rewards/margins": 0.4240697920322418, "rewards/rejected": -5.068819999694824, "sft_loss": 4.279450416564941, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 0.8599736694295398, "learning_rate": 9.910605540119474e-07, "logits/chosen": -0.3107752501964569, "logits/rejected": -0.13118943572044373, "logps/chosen": -4.980652332305908, "logps/rejected": -5.300137519836426, "loss": 0.0549, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.980652332305908, "rewards/margins": 0.31948500871658325, "rewards/rejected": -5.300137519836426, "sft_loss": 4.596789360046387, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 0.5817887629532634, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.47135478258132935, "logits/rejected": -0.10487018525600433, "logps/chosen": -4.572172164916992, "logps/rejected": -4.95493221282959, "loss": 0.0537, "rewards/accuracies": 0.65625, "rewards/chosen": -4.572172164916992, "rewards/margins": 0.38275963068008423, "rewards/rejected": -4.95493221282959, "sft_loss": 4.365973949432373, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 0.8546776312375493, "learning_rate": 9.90464666629803e-07, "logits/chosen": -0.305539071559906, "logits/rejected": -0.18519474565982819, "logps/chosen": -4.459600925445557, "logps/rejected": -4.81002140045166, "loss": 0.0551, "rewards/accuracies": 0.5625, "rewards/chosen": -4.459600925445557, "rewards/margins": 0.3504212200641632, "rewards/rejected": -4.81002140045166, "sft_loss": 4.2054009437561035, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 0.9477596077907503, "learning_rate": 9.901595837463363e-07, "logits/chosen": -0.3723621070384979, "logits/rejected": -0.12102153152227402, "logps/chosen": -4.758284091949463, "logps/rejected": -5.178713798522949, "loss": 0.0539, "rewards/accuracies": 0.625, "rewards/chosen": -4.758284091949463, "rewards/margins": 0.42043009400367737, "rewards/rejected": -5.178713798522949, "sft_loss": 4.508724212646484, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 0.5092167105949217, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.3669915199279785, "logits/rejected": -0.2551936209201813, "logps/chosen": -4.825407981872559, "logps/rejected": -5.157772064208984, "loss": 0.0533, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.825407981872559, "rewards/margins": 0.33236438035964966, "rewards/rejected": -5.157772064208984, "sft_loss": 4.503109931945801, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 0.9726713592254466, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.39609891176223755, "logits/rejected": -0.21996548771858215, "logps/chosen": -4.433319091796875, "logps/rejected": -4.735190391540527, "loss": 0.0546, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.433319091796875, "rewards/margins": 0.301870733499527, "rewards/rejected": -4.735190391540527, "sft_loss": 4.102834224700928, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 0.7422075006162999, "learning_rate": 9.892158139836724e-07, "logits/chosen": -0.27023711800575256, "logits/rejected": -0.12920354306697845, "logps/chosen": -4.826740741729736, "logps/rejected": -4.9720458984375, "loss": 0.0548, "rewards/accuracies": 0.5625, "rewards/chosen": -4.826740741729736, "rewards/margins": 0.1453053057193756, "rewards/rejected": -4.9720458984375, "sft_loss": 4.488923072814941, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 0.4899561851874869, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.33869457244873047, "logits/rejected": -0.20442715287208557, "logps/chosen": -4.991418361663818, "logps/rejected": -5.21295690536499, "loss": 0.0537, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.991418361663818, "rewards/margins": 0.22153854370117188, "rewards/rejected": -5.21295690536499, "sft_loss": 4.614162445068359, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 1.4087474281540486, "learning_rate": 9.885628971850641e-07, "logits/chosen": -0.3120538592338562, "logits/rejected": -0.055802445858716965, "logps/chosen": -4.461789608001709, "logps/rejected": -4.915839195251465, "loss": 0.0537, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.461789608001709, "rewards/margins": 0.4540492594242096, "rewards/rejected": -4.915839195251465, "sft_loss": 4.131707191467285, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 0.3428163959894264, "learning_rate": 9.882293271315481e-07, "logits/chosen": -0.30345502495765686, "logits/rejected": -0.16580362617969513, "logps/chosen": -4.670796871185303, "logps/rejected": -5.047994613647461, "loss": 0.0539, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.670796871185303, "rewards/margins": 0.3771972060203552, "rewards/rejected": -5.047994613647461, "sft_loss": 4.354706764221191, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 0.3759873331174023, "learning_rate": 9.878910202749589e-07, "logits/chosen": -0.35124725103378296, "logits/rejected": -0.07594305276870728, "logps/chosen": -4.675352573394775, "logps/rejected": -5.124632835388184, "loss": 0.0527, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.675352573394775, "rewards/margins": 0.44928035140037537, "rewards/rejected": -5.124632835388184, "sft_loss": 4.424468040466309, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 0.697669128896546, "learning_rate": 9.875479798975512e-07, "logits/chosen": -0.1969275176525116, "logits/rejected": 0.06542123854160309, "logps/chosen": -4.537474632263184, "logps/rejected": -5.131258964538574, "loss": 0.0527, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.537474632263184, "rewards/margins": 0.5937844514846802, "rewards/rejected": -5.131258964538574, "sft_loss": 4.197497367858887, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 0.5767388364824774, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.21197989583015442, "logits/rejected": 0.06789512932300568, "logps/chosen": -4.644294261932373, "logps/rejected": -4.911935806274414, "loss": 0.054, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.644294261932373, "rewards/margins": 0.2676416039466858, "rewards/rejected": -4.911935806274414, "sft_loss": 4.177750110626221, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 1.0344418790294128, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.21944479644298553, "logits/rejected": -0.11954133212566376, "logps/chosen": -4.733977794647217, "logps/rejected": -5.374486446380615, "loss": 0.053, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.733977794647217, "rewards/margins": 0.6405088305473328, "rewards/rejected": -5.374486446380615, "sft_loss": 4.4577178955078125, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 0.8658733433242235, "learning_rate": 9.864904911516383e-07, "logits/chosen": -0.18332402408123016, "logits/rejected": -0.07167172431945801, "logps/chosen": -4.696061611175537, "logps/rejected": -5.125707149505615, "loss": 0.0535, "rewards/accuracies": 0.625, "rewards/chosen": -4.696061611175537, "rewards/margins": 0.42964568734169006, "rewards/rejected": -5.125707149505615, "sft_loss": 4.375224590301514, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 0.8231450884283922, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.1758008599281311, "logits/rejected": -0.06344632804393768, "logps/chosen": -4.6443257331848145, "logps/rejected": -4.973420143127441, "loss": 0.0552, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.6443257331848145, "rewards/margins": 0.329093873500824, "rewards/rejected": -4.973420143127441, "sft_loss": 4.3561811447143555, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 0.46591650849175953, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.3961028456687927, "logits/rejected": -0.15232224762439728, "logps/chosen": -4.621685028076172, "logps/rejected": -5.0412163734436035, "loss": 0.0539, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.621685028076172, "rewards/margins": 0.4195311665534973, "rewards/rejected": -5.0412163734436035, "sft_loss": 4.4617919921875, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 1.0101262696651687, "learning_rate": 9.853905232845727e-07, "logits/chosen": -0.3851791024208069, "logits/rejected": -0.15365764498710632, "logps/chosen": -4.548623085021973, "logps/rejected": -4.8963422775268555, "loss": 0.0542, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.548623085021973, "rewards/margins": 0.34772005677223206, "rewards/rejected": -4.8963422775268555, "sft_loss": 4.327893257141113, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 0.5381428324831616, "learning_rate": 9.850144440181095e-07, "logits/chosen": -0.2905058264732361, "logits/rejected": -0.006855395622551441, "logps/chosen": -4.724823951721191, "logps/rejected": -5.080996036529541, "loss": 0.0541, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.724823951721191, "rewards/margins": 0.35617202520370483, "rewards/rejected": -5.080996036529541, "sft_loss": 4.46238899230957, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 0.5495692093454033, "learning_rate": 9.846336591393832e-07, "logits/chosen": -0.1810792237520218, "logits/rejected": 0.015948548913002014, "logps/chosen": -4.573351860046387, "logps/rejected": -4.975152492523193, "loss": 0.054, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.573351860046387, "rewards/margins": 0.4018007218837738, "rewards/rejected": -4.975152492523193, "sft_loss": 4.238281726837158, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 0.8432393389554336, "learning_rate": 9.842481723427704e-07, "logits/chosen": -0.16392990946769714, "logits/rejected": -0.1051705926656723, "logps/chosen": -4.922740936279297, "logps/rejected": -5.354665279388428, "loss": 0.0547, "rewards/accuracies": 0.59375, "rewards/chosen": -4.922740936279297, "rewards/margins": 0.4319241940975189, "rewards/rejected": -5.354665279388428, "sft_loss": 4.64121150970459, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 0.5154826838857, "learning_rate": 9.838579873682658e-07, "logits/chosen": -0.21759569644927979, "logits/rejected": -0.22146296501159668, "logps/chosen": -4.731788158416748, "logps/rejected": -5.0571770668029785, "loss": 0.0547, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.731788158416748, "rewards/margins": 0.3253888487815857, "rewards/rejected": -5.0571770668029785, "sft_loss": 4.384237289428711, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 0.9101370133418469, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.3886938691139221, "logits/rejected": -0.09766797721385956, "logps/chosen": -4.5959296226501465, "logps/rejected": -5.024158477783203, "loss": 0.0527, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.5959296226501465, "rewards/margins": 0.4282284677028656, "rewards/rejected": -5.024158477783203, "sft_loss": 4.394379138946533, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 0.41205584641739607, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.4098678231239319, "logits/rejected": -0.16409628093242645, "logps/chosen": -4.53586483001709, "logps/rejected": -4.860126972198486, "loss": 0.0535, "rewards/accuracies": 0.625, "rewards/chosen": -4.53586483001709, "rewards/margins": 0.32426196336746216, "rewards/rejected": -4.860126972198486, "sft_loss": 4.254485130310059, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 0.5228974262769843, "learning_rate": 9.826592814608517e-07, "logits/chosen": -0.29943370819091797, "logits/rejected": -0.012423193082213402, "logps/chosen": -4.564153671264648, "logps/rejected": -4.881725311279297, "loss": 0.0536, "rewards/accuracies": 0.5625, "rewards/chosen": -4.564153671264648, "rewards/margins": 0.31757181882858276, "rewards/rejected": -4.881725311279297, "sft_loss": 4.236629962921143, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 0.48911677587093744, "learning_rate": 9.822503420858067e-07, "logits/chosen": -0.16598041355609894, "logits/rejected": -0.18363508582115173, "logps/chosen": -4.710743427276611, "logps/rejected": -4.974171161651611, "loss": 0.0537, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.710743427276611, "rewards/margins": 0.26342788338661194, "rewards/rejected": -4.974171161651611, "sft_loss": 4.399572849273682, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 0.8522763924193989, "learning_rate": 9.818367239158277e-07, "logits/chosen": -0.20104625821113586, "logits/rejected": -0.13274021446704865, "logps/chosen": -4.837375640869141, "logps/rejected": -5.168421745300293, "loss": 0.0531, "rewards/accuracies": 0.59375, "rewards/chosen": -4.837375640869141, "rewards/margins": 0.33104628324508667, "rewards/rejected": -5.168421745300293, "sft_loss": 4.40709924697876, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 0.9809109204437388, "learning_rate": 9.8141843096384e-07, "logits/chosen": -0.24949832260608673, "logits/rejected": -0.013175847008824348, "logps/chosen": -4.6659345626831055, "logps/rejected": -5.258612632751465, "loss": 0.0535, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.6659345626831055, "rewards/margins": 0.592678427696228, "rewards/rejected": -5.258612632751465, "sft_loss": 4.3432440757751465, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 0.6177294989073903, "learning_rate": 9.809954672881237e-07, "logits/chosen": -0.2501566410064697, "logits/rejected": -0.04616551846265793, "logps/chosen": -4.930415153503418, "logps/rejected": -5.398449897766113, "loss": 0.0538, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.930415153503418, "rewards/margins": 0.4680354595184326, "rewards/rejected": -5.398449897766113, "sft_loss": 4.56772518157959, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 0.7477611616553518, "learning_rate": 9.80567836992274e-07, "logits/chosen": -0.3545466959476471, "logits/rejected": -0.10054433345794678, "logps/chosen": -4.5024824142456055, "logps/rejected": -4.952744960784912, "loss": 0.0546, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.5024824142456055, "rewards/margins": 0.4502628445625305, "rewards/rejected": -4.952744960784912, "sft_loss": 4.22027063369751, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 0.6168398978776214, "learning_rate": 9.801355442251625e-07, "logits/chosen": -0.4304068982601166, "logits/rejected": -0.18468213081359863, "logps/chosen": -4.629603385925293, "logps/rejected": -4.93204402923584, "loss": 0.0542, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.629603385925293, "rewards/margins": 0.30244094133377075, "rewards/rejected": -4.93204402923584, "sft_loss": 4.375184059143066, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 0.5481544408539222, "learning_rate": 9.796985931808949e-07, "logits/chosen": -0.41533294320106506, "logits/rejected": -0.15929146111011505, "logps/chosen": -4.695633888244629, "logps/rejected": -5.094050407409668, "loss": 0.0527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.695633888244629, "rewards/margins": 0.398416668176651, "rewards/rejected": -5.094050407409668, "sft_loss": 4.4184675216674805, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 0.4981215224708011, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.35322266817092896, "logits/rejected": -0.1806357502937317, "logps/chosen": -4.483834743499756, "logps/rejected": -4.888542652130127, "loss": 0.0532, "rewards/accuracies": 0.6875, "rewards/chosen": -4.483834743499756, "rewards/margins": 0.40470829606056213, "rewards/rejected": -4.888542652130127, "sft_loss": 4.222219944000244, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 0.6564438791162569, "learning_rate": 9.788107332632493e-07, "logits/chosen": -0.25478994846343994, "logits/rejected": -0.17686933279037476, "logps/chosen": -4.613704204559326, "logps/rejected": -4.8227033615112305, "loss": 0.0562, "rewards/accuracies": 0.59375, "rewards/chosen": -4.613704204559326, "rewards/margins": 0.2089988738298416, "rewards/rejected": -4.8227033615112305, "sft_loss": 4.313691139221191, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 0.7392853368692655, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.44997015595436096, "logits/rejected": -0.27288126945495605, "logps/chosen": -4.7664642333984375, "logps/rejected": -5.0896687507629395, "loss": 0.0537, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.7664642333984375, "rewards/margins": 0.3232039511203766, "rewards/rejected": -5.0896687507629395, "sft_loss": 4.511096000671387, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 0.6876145426311351, "learning_rate": 9.779042916953376e-07, "logits/chosen": -0.3371017575263977, "logits/rejected": -0.06301428377628326, "logps/chosen": -4.491923809051514, "logps/rejected": -5.174212455749512, "loss": 0.0523, "rewards/accuracies": 0.71875, "rewards/chosen": -4.491923809051514, "rewards/margins": 0.6822883486747742, "rewards/rejected": -5.174212455749512, "sft_loss": 4.314373970031738, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 0.7139936287057881, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.3784538507461548, "logits/rejected": -0.17158463597297668, "logps/chosen": -4.396938323974609, "logps/rejected": -4.944519996643066, "loss": 0.0519, "rewards/accuracies": 0.6875, "rewards/chosen": -4.396938323974609, "rewards/margins": 0.5475821495056152, "rewards/rejected": -4.944519996643066, "sft_loss": 4.0358452796936035, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 0.5892634440060744, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.3060999810695648, "logits/rejected": -0.1631624698638916, "logps/chosen": -4.6079511642456055, "logps/rejected": -5.2234787940979, "loss": 0.0527, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.6079511642456055, "rewards/margins": 0.6155272126197815, "rewards/rejected": -5.2234787940979, "sft_loss": 4.3594512939453125, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 0.929026425003863, "learning_rate": 9.765098658960035e-07, "logits/chosen": -0.18277141451835632, "logits/rejected": -0.1487278938293457, "logps/chosen": -4.66930627822876, "logps/rejected": -5.10184907913208, "loss": 0.0534, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.66930627822876, "rewards/margins": 0.4325428009033203, "rewards/rejected": -5.10184907913208, "sft_loss": 4.38519287109375, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 1.1507620636323597, "learning_rate": 9.76035805036924e-07, "logits/chosen": -0.12372313439846039, "logits/rejected": 0.09936396777629852, "logps/chosen": -4.571523189544678, "logps/rejected": -5.108660697937012, "loss": 0.0524, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.571523189544678, "rewards/margins": 0.5371370911598206, "rewards/rejected": -5.108660697937012, "sft_loss": 4.278738021850586, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 1.1625783677852743, "learning_rate": 9.755571256763764e-07, "logits/chosen": -0.12434210628271103, "logits/rejected": 0.043297264724969864, "logps/chosen": -4.410771369934082, "logps/rejected": -4.947856426239014, "loss": 0.053, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.410771369934082, "rewards/margins": 0.5370848178863525, "rewards/rejected": -4.947856426239014, "sft_loss": 4.148685932159424, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 0.7569548293124136, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.26663917303085327, "logits/rejected": 0.0644756332039833, "logps/chosen": -4.429556846618652, "logps/rejected": -4.855227470397949, "loss": 0.0542, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.429556846618652, "rewards/margins": 0.42567119002342224, "rewards/rejected": -4.855227470397949, "sft_loss": 4.217803001403809, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 0.5098138301352244, "learning_rate": 9.74585930072237e-07, "logits/chosen": -0.1906522810459137, "logits/rejected": -0.0011430894955992699, "logps/chosen": -4.641746997833252, "logps/rejected": -5.193634986877441, "loss": 0.0528, "rewards/accuracies": 0.65625, "rewards/chosen": -4.641746997833252, "rewards/margins": 0.5518878698348999, "rewards/rejected": -5.193634986877441, "sft_loss": 4.367932319641113, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 0.5456950404150688, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.3177880346775055, "logits/rejected": -0.1907317340373993, "logps/chosen": -4.793037414550781, "logps/rejected": -5.058445930480957, "loss": 0.0543, "rewards/accuracies": 0.625, "rewards/chosen": -4.793037414550781, "rewards/margins": 0.265408456325531, "rewards/rejected": -5.058445930480957, "sft_loss": 4.428118705749512, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 0.6987818198293332, "learning_rate": 9.735963167736698e-07, "logits/chosen": -0.23779654502868652, "logits/rejected": -0.053666941821575165, "logps/chosen": -4.703165531158447, "logps/rejected": -5.080471992492676, "loss": 0.0532, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.703165531158447, "rewards/margins": 0.3773062825202942, "rewards/rejected": -5.080471992492676, "sft_loss": 4.4036335945129395, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 0.4862054708330395, "learning_rate": 9.730946154626078e-07, "logits/chosen": -0.2143319547176361, "logits/rejected": -0.09488032013177872, "logps/chosen": -4.576449394226074, "logps/rejected": -4.933585166931152, "loss": 0.0529, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.576449394226074, "rewards/margins": 0.35713550448417664, "rewards/rejected": -4.933585166931152, "sft_loss": 4.224600791931152, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 0.5843251817725653, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.4183814525604248, "logits/rejected": -0.20347313582897186, "logps/chosen": -4.669787406921387, "logps/rejected": -5.200324058532715, "loss": 0.0533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.669787406921387, "rewards/margins": 0.5305365324020386, "rewards/rejected": -5.200324058532715, "sft_loss": 4.445235252380371, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 0.6456408033834538, "learning_rate": 9.720774478544218e-07, "logits/chosen": -0.2532975971698761, "logits/rejected": -0.04155152291059494, "logps/chosen": -4.547806262969971, "logps/rejected": -5.053961753845215, "loss": 0.0527, "rewards/accuracies": 0.65625, "rewards/chosen": -4.547806262969971, "rewards/margins": 0.5061560869216919, "rewards/rejected": -5.053961753845215, "sft_loss": 4.219943046569824, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 0.5237674186864618, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.36289113759994507, "logits/rejected": -0.2682749032974243, "logps/chosen": -4.5317912101745605, "logps/rejected": -4.91931676864624, "loss": 0.0527, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.5317912101745605, "rewards/margins": 0.3875252604484558, "rewards/rejected": -4.91931676864624, "sft_loss": 4.134699821472168, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 0.7270955294325449, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.30084487795829773, "logits/rejected": -0.07306574285030365, "logps/chosen": -4.786416530609131, "logps/rejected": -5.113640785217285, "loss": 0.0541, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.786416530609131, "rewards/margins": 0.3272242546081543, "rewards/rejected": -5.113640785217285, "sft_loss": 4.5564351081848145, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 0.7225983756880933, "learning_rate": 9.705173583245643e-07, "logits/chosen": -0.32694125175476074, "logits/rejected": -0.05704299733042717, "logps/chosen": -4.673354625701904, "logps/rejected": -5.159552574157715, "loss": 0.0529, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.673354625701904, "rewards/margins": 0.48619788885116577, "rewards/rejected": -5.159552574157715, "sft_loss": 4.341565132141113, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 0.6272602306537511, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.40429896116256714, "logits/rejected": -0.23206374049186707, "logps/chosen": -4.349539756774902, "logps/rejected": -4.834110260009766, "loss": 0.0525, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.349539756774902, "rewards/margins": 0.4845706820487976, "rewards/rejected": -4.834110260009766, "sft_loss": 4.087153434753418, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 0.664900836717412, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.3435072600841522, "logits/rejected": -0.05373241752386093, "logps/chosen": -4.66226863861084, "logps/rejected": -5.135153293609619, "loss": 0.0531, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.66226863861084, "rewards/margins": 0.47288474440574646, "rewards/rejected": -5.135153293609619, "sft_loss": 4.391148090362549, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 0.46821226895754736, "learning_rate": 9.689161844071755e-07, "logits/chosen": -0.1869077980518341, "logits/rejected": -0.06527513265609741, "logps/chosen": -4.599356651306152, "logps/rejected": -4.932245254516602, "loss": 0.0535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.599356651306152, "rewards/margins": 0.3328891396522522, "rewards/rejected": -4.932245254516602, "sft_loss": 4.303515434265137, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 0.681156914428372, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.35399603843688965, "logits/rejected": -0.0633891299366951, "logps/chosen": -4.58389949798584, "logps/rejected": -4.945822715759277, "loss": 0.0532, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.58389949798584, "rewards/margins": 0.3619235157966614, "rewards/rejected": -4.945822715759277, "sft_loss": 4.220614433288574, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 0.48492189227810406, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.32695597410202026, "logits/rejected": -0.29391515254974365, "logps/chosen": -4.7623701095581055, "logps/rejected": -5.030303001403809, "loss": 0.054, "rewards/accuracies": 0.59375, "rewards/chosen": -4.7623701095581055, "rewards/margins": 0.26793205738067627, "rewards/rejected": -5.030303001403809, "sft_loss": 4.4485955238342285, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 0.6311037618756278, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.44052833318710327, "logits/rejected": -0.262589693069458, "logps/chosen": -4.548510551452637, "logps/rejected": -5.029641151428223, "loss": 0.0524, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.548510551452637, "rewards/margins": 0.4811309278011322, "rewards/rejected": -5.029641151428223, "sft_loss": 4.205142498016357, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 0.5356140327074448, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.3407883048057556, "logits/rejected": -0.17592433094978333, "logps/chosen": -4.670349597930908, "logps/rejected": -5.063414573669434, "loss": 0.0528, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.670349597930908, "rewards/margins": 0.3930647373199463, "rewards/rejected": -5.063414573669434, "sft_loss": 4.288704872131348, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 0.5224161429228669, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.2652224898338318, "logits/rejected": -0.16097518801689148, "logps/chosen": -4.668039798736572, "logps/rejected": -5.022774696350098, "loss": 0.0541, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.668039798736572, "rewards/margins": 0.3547355532646179, "rewards/rejected": -5.022774696350098, "sft_loss": 4.322123050689697, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 0.5931763351107936, "learning_rate": 9.655911462268327e-07, "logits/chosen": -0.26682934165000916, "logits/rejected": -0.12901510298252106, "logps/chosen": -4.589568138122559, "logps/rejected": -5.130476951599121, "loss": 0.0519, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.589568138122559, "rewards/margins": 0.5409084558486938, "rewards/rejected": -5.130476951599121, "sft_loss": 4.2350568771362305, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 0.4542520418662769, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.3942444920539856, "logits/rejected": -0.2551219165325165, "logps/chosen": -4.672030448913574, "logps/rejected": -4.914698123931885, "loss": 0.0556, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.672030448913574, "rewards/margins": 0.24266783893108368, "rewards/rejected": -4.914698123931885, "sft_loss": 4.431342124938965, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 0.5438517815194526, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.4611254632472992, "logits/rejected": -0.2544347643852234, "logps/chosen": -4.990399360656738, "logps/rejected": -5.296509742736816, "loss": 0.0535, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.990399360656738, "rewards/margins": 0.30610987544059753, "rewards/rejected": -5.296509742736816, "sft_loss": 4.532609462738037, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 0.6798316854469919, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.4359145164489746, "logits/rejected": -0.18338151276111603, "logps/chosen": -4.589885234832764, "logps/rejected": -5.0170488357543945, "loss": 0.0534, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.589885234832764, "rewards/margins": 0.4271632730960846, "rewards/rejected": -5.0170488357543945, "sft_loss": 4.323793411254883, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 0.4468539269210184, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.429384708404541, "logits/rejected": -0.1899581104516983, "logps/chosen": -4.401920795440674, "logps/rejected": -4.892082214355469, "loss": 0.0535, "rewards/accuracies": 0.6875, "rewards/chosen": -4.401920795440674, "rewards/margins": 0.49016109108924866, "rewards/rejected": -4.892082214355469, "sft_loss": 4.202319145202637, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 0.6567316541688504, "learning_rate": 9.626960114955483e-07, "logits/chosen": -0.3670702874660492, "logits/rejected": -0.15794377028942108, "logps/chosen": -4.501652240753174, "logps/rejected": -5.101126670837402, "loss": 0.0528, "rewards/accuracies": 0.65625, "rewards/chosen": -4.501652240753174, "rewards/margins": 0.5994741916656494, "rewards/rejected": -5.101126670837402, "sft_loss": 4.22614049911499, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 0.8362449587781202, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.4296230375766754, "logits/rejected": -0.22723379731178284, "logps/chosen": -4.814973831176758, "logps/rejected": -5.287397861480713, "loss": 0.0536, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.814973831176758, "rewards/margins": 0.47242408990859985, "rewards/rejected": -5.287397861480713, "sft_loss": 4.558465957641602, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 0.7260413821003713, "learning_rate": 9.615064944219021e-07, "logits/chosen": -0.2747848927974701, "logits/rejected": -0.09800789505243301, "logps/chosen": -4.492433071136475, "logps/rejected": -4.882441520690918, "loss": 0.0525, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.492433071136475, "rewards/margins": 0.3900087773799896, "rewards/rejected": -4.882441520690918, "sft_loss": 4.168577671051025, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 1.0721944049489407, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.31729406118392944, "logits/rejected": -0.25745829939842224, "logps/chosen": -4.472096920013428, "logps/rejected": -4.868285655975342, "loss": 0.0533, "rewards/accuracies": 0.65625, "rewards/chosen": -4.472096920013428, "rewards/margins": 0.396188348531723, "rewards/rejected": -4.868285655975342, "sft_loss": 4.1427130699157715, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.2683974504470825, "eval_logits/rejected": 0.40141844749450684, "eval_logps/chosen": -4.519753456115723, "eval_logps/rejected": -5.018647193908691, "eval_loss": 0.05182144418358803, "eval_rewards/accuracies": 0.6483679413795471, "eval_rewards/chosen": -4.519753456115723, "eval_rewards/margins": 0.4988936185836792, "eval_rewards/rejected": -5.018647193908691, "eval_runtime": 42.9586, "eval_samples_per_second": 31.309, "eval_sft_loss": 4.150440692901611, "eval_steps_per_second": 7.845, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 0.7156317001015575, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.3744416832923889, "logits/rejected": -0.13187824189662933, "logps/chosen": -4.595070838928223, "logps/rejected": -5.1346435546875, "loss": 0.0521, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.595070838928223, "rewards/margins": 0.5395724177360535, "rewards/rejected": -5.1346435546875, "sft_loss": 4.348048210144043, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 0.6367294977520913, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.3582982122898102, "logits/rejected": -0.048285432159900665, "logps/chosen": -4.640942573547363, "logps/rejected": -5.121342182159424, "loss": 0.0539, "rewards/accuracies": 0.65625, "rewards/chosen": -4.640942573547363, "rewards/margins": 0.4803994297981262, "rewards/rejected": -5.121342182159424, "sft_loss": 4.395642280578613, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 1.4858558236187764, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.41571682691574097, "logits/rejected": -0.20789651572704315, "logps/chosen": -4.591765880584717, "logps/rejected": -4.994922637939453, "loss": 0.0535, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.591765880584717, "rewards/margins": 0.403156578540802, "rewards/rejected": -4.994922637939453, "sft_loss": 4.2717695236206055, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 0.5323772990644925, "learning_rate": 9.584544477031816e-07, "logits/chosen": -0.18281269073486328, "logits/rejected": -0.012447918765246868, "logps/chosen": -4.610562324523926, "logps/rejected": -4.983885765075684, "loss": 0.0533, "rewards/accuracies": 0.65625, "rewards/chosen": -4.610562324523926, "rewards/margins": 0.37332338094711304, "rewards/rejected": -4.983885765075684, "sft_loss": 4.315565586090088, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 0.5854916339421308, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.44287237524986267, "logits/rejected": -0.1460832804441452, "logps/chosen": -4.5870513916015625, "logps/rejected": -4.914314270019531, "loss": 0.0532, "rewards/accuracies": 0.625, "rewards/chosen": -4.5870513916015625, "rewards/margins": 0.32726341485977173, "rewards/rejected": -4.914314270019531, "sft_loss": 4.275424003601074, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 0.5795088883824004, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.4347182810306549, "logits/rejected": -0.28513267636299133, "logps/chosen": -4.611976623535156, "logps/rejected": -4.973719120025635, "loss": 0.0544, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.611976623535156, "rewards/margins": 0.36174243688583374, "rewards/rejected": -4.973719120025635, "sft_loss": 4.437606334686279, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 0.34028010927917013, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.4847342371940613, "logits/rejected": -0.21808162331581116, "logps/chosen": -4.651471138000488, "logps/rejected": -5.037489891052246, "loss": 0.053, "rewards/accuracies": 0.65625, "rewards/chosen": -4.651471138000488, "rewards/margins": 0.3860177993774414, "rewards/rejected": -5.037489891052246, "sft_loss": 4.363861083984375, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 0.3618440750506579, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.5013711452484131, "logits/rejected": -0.34211140871047974, "logps/chosen": -4.682461261749268, "logps/rejected": -4.998291492462158, "loss": 0.0531, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.682461261749268, "rewards/margins": 0.31583017110824585, "rewards/rejected": -4.998291492462158, "sft_loss": 4.3636016845703125, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 0.518393886973729, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.41425085067749023, "logits/rejected": -0.17347685992717743, "logps/chosen": -4.481367588043213, "logps/rejected": -4.9345831871032715, "loss": 0.0533, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.481367588043213, "rewards/margins": 0.4532155394554138, "rewards/rejected": -4.9345831871032715, "sft_loss": 4.235280513763428, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 0.4137314181635811, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.45675116777420044, "logits/rejected": -0.15419267117977142, "logps/chosen": -4.616604804992676, "logps/rejected": -4.993886947631836, "loss": 0.0535, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.616604804992676, "rewards/margins": 0.3772817552089691, "rewards/rejected": -4.993886947631836, "sft_loss": 4.367188930511475, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 0.5601409436047087, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.3995305895805359, "logits/rejected": -0.24616310000419617, "logps/chosen": -4.687084674835205, "logps/rejected": -5.09484338760376, "loss": 0.0542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.687084674835205, "rewards/margins": 0.407759428024292, "rewards/rejected": -5.09484338760376, "sft_loss": 4.395379066467285, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 0.8031738031976163, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.4419690668582916, "logits/rejected": -0.23541542887687683, "logps/chosen": -4.723730564117432, "logps/rejected": -5.2569379806518555, "loss": 0.0529, "rewards/accuracies": 0.65625, "rewards/chosen": -4.723730564117432, "rewards/margins": 0.5332074165344238, "rewards/rejected": -5.2569379806518555, "sft_loss": 4.406943321228027, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 0.7204423519976543, "learning_rate": 9.526810643189754e-07, "logits/chosen": -0.32574447989463806, "logits/rejected": -0.040873341262340546, "logps/chosen": -4.396466255187988, "logps/rejected": -4.99883508682251, "loss": 0.0528, "rewards/accuracies": 0.65625, "rewards/chosen": -4.396466255187988, "rewards/margins": 0.6023694276809692, "rewards/rejected": -4.99883508682251, "sft_loss": 4.184441566467285, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 0.4751320652431749, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.43547144532203674, "logits/rejected": -0.3618202805519104, "logps/chosen": -4.632751941680908, "logps/rejected": -4.960574150085449, "loss": 0.055, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.632751941680908, "rewards/margins": 0.3278222680091858, "rewards/rejected": -4.960574150085449, "sft_loss": 4.420085906982422, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 0.5521181566167117, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.49553728103637695, "logits/rejected": -0.3846838176250458, "logps/chosen": -4.746697425842285, "logps/rejected": -5.085881233215332, "loss": 0.0535, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.746697425842285, "rewards/margins": 0.33918410539627075, "rewards/rejected": -5.085881233215332, "sft_loss": 4.425386905670166, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 0.6424572601366976, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.35927778482437134, "logits/rejected": -0.1263047158718109, "logps/chosen": -4.6220550537109375, "logps/rejected": -4.97434663772583, "loss": 0.0527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.6220550537109375, "rewards/margins": 0.3522917628288269, "rewards/rejected": -4.97434663772583, "sft_loss": 4.350966453552246, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 0.40220128996335763, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.37630099058151245, "logits/rejected": -0.10924334824085236, "logps/chosen": -4.406384468078613, "logps/rejected": -4.704051494598389, "loss": 0.0531, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.406384468078613, "rewards/margins": 0.2976674437522888, "rewards/rejected": -4.704051494598389, "sft_loss": 4.099337100982666, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 0.3608110540736878, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.4187045097351074, "logits/rejected": -0.20366141200065613, "logps/chosen": -4.68743371963501, "logps/rejected": -5.046011447906494, "loss": 0.0536, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.68743371963501, "rewards/margins": 0.358577698469162, "rewards/rejected": -5.046011447906494, "sft_loss": 4.357827186584473, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 0.5472169705894456, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.32792991399765015, "logits/rejected": -0.3828414976596832, "logps/chosen": -4.738595485687256, "logps/rejected": -5.188597679138184, "loss": 0.0532, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.738595485687256, "rewards/margins": 0.4500022530555725, "rewards/rejected": -5.188597679138184, "sft_loss": 4.347311973571777, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 0.4557106046597545, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.342276394367218, "logits/rejected": 0.016852790489792824, "logps/chosen": -4.5591607093811035, "logps/rejected": -5.212889671325684, "loss": 0.0514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.5591607093811035, "rewards/margins": 0.6537296772003174, "rewards/rejected": -5.212889671325684, "sft_loss": 4.230761528015137, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 0.5654653462106812, "learning_rate": 9.472503898067645e-07, "logits/chosen": -0.16376064717769623, "logits/rejected": -0.11671161651611328, "logps/chosen": -4.611720085144043, "logps/rejected": -4.8727312088012695, "loss": 0.0548, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.611720085144043, "rewards/margins": 0.2610107958316803, "rewards/rejected": -4.8727312088012695, "sft_loss": 4.201091289520264, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 0.5870624108603953, "learning_rate": 9.465519589364099e-07, "logits/chosen": -0.18181738257408142, "logits/rejected": -0.09554007649421692, "logps/chosen": -4.874770164489746, "logps/rejected": -5.174876689910889, "loss": 0.0532, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.874770164489746, "rewards/margins": 0.3001064956188202, "rewards/rejected": -5.174876689910889, "sft_loss": 4.453106880187988, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 0.5024960047819973, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.3122777044773102, "logits/rejected": -0.08371108025312424, "logps/chosen": -4.6728692054748535, "logps/rejected": -5.066531181335449, "loss": 0.0526, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.6728692054748535, "rewards/margins": 0.3936619758605957, "rewards/rejected": -5.066531181335449, "sft_loss": 4.272462368011475, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 0.5431723145779719, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.46850499510765076, "logits/rejected": -0.2023623287677765, "logps/chosen": -4.442728519439697, "logps/rejected": -5.026824951171875, "loss": 0.0519, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.442728519439697, "rewards/margins": 0.5840964913368225, "rewards/rejected": -5.026824951171875, "sft_loss": 4.181841850280762, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 0.546519278936773, "learning_rate": 9.444306989457805e-07, "logits/chosen": -0.1584623157978058, "logits/rejected": -0.028326964005827904, "logps/chosen": -4.671021938323975, "logps/rejected": -5.056215763092041, "loss": 0.0533, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.671021938323975, "rewards/margins": 0.3851930797100067, "rewards/rejected": -5.056215763092041, "sft_loss": 4.2725138664245605, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 0.9667671251257445, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.2628821134567261, "logits/rejected": -0.10308748483657837, "logps/chosen": -4.612080097198486, "logps/rejected": -4.99003791809082, "loss": 0.0539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.612080097198486, "rewards/margins": 0.3779585063457489, "rewards/rejected": -4.99003791809082, "sft_loss": 4.356131076812744, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 0.3846431850386535, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.2996244430541992, "logits/rejected": -0.15662053227424622, "logps/chosen": -4.658416271209717, "logps/rejected": -5.115981578826904, "loss": 0.0524, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.658416271209717, "rewards/margins": 0.45756563544273376, "rewards/rejected": -5.115981578826904, "sft_loss": 4.365849494934082, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 0.665935476867161, "learning_rate": 9.422706323888396e-07, "logits/chosen": -0.29253220558166504, "logits/rejected": -0.25073716044425964, "logps/chosen": -4.465968608856201, "logps/rejected": -4.916172981262207, "loss": 0.0527, "rewards/accuracies": 0.6875, "rewards/chosen": -4.465968608856201, "rewards/margins": 0.4502039849758148, "rewards/rejected": -4.916172981262207, "sft_loss": 4.195641994476318, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 0.5706511414826213, "learning_rate": 9.415420190240225e-07, "logits/chosen": -0.26913461089134216, "logits/rejected": 0.0276435948908329, "logps/chosen": -4.300182819366455, "logps/rejected": -5.042660713195801, "loss": 0.0504, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.300182819366455, "rewards/margins": 0.742477536201477, "rewards/rejected": -5.042660713195801, "sft_loss": 4.060831546783447, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 0.7175039337411655, "learning_rate": 9.408091218166002e-07, "logits/chosen": -0.21212784945964813, "logits/rejected": -0.13273796439170837, "logps/chosen": -4.553486347198486, "logps/rejected": -4.936342239379883, "loss": 0.0534, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.553486347198486, "rewards/margins": 0.3828561007976532, "rewards/rejected": -4.936342239379883, "sft_loss": 4.191648960113525, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 0.63623683652012, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.28056463599205017, "logits/rejected": 0.1325863003730774, "logps/chosen": -4.82030725479126, "logps/rejected": -5.4860968589782715, "loss": 0.0518, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.82030725479126, "rewards/margins": 0.6657902002334595, "rewards/rejected": -5.4860968589782715, "sft_loss": 4.4288787841796875, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 0.7126472216799089, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.26858845353126526, "logits/rejected": -0.1267586648464203, "logps/chosen": -4.539680480957031, "logps/rejected": -5.149550437927246, "loss": 0.053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.539680480957031, "rewards/margins": 0.6098700165748596, "rewards/rejected": -5.149550437927246, "sft_loss": 4.167618751525879, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 0.550303910887877, "learning_rate": 9.38584798451817e-07, "logits/chosen": -0.37861794233322144, "logits/rejected": -0.1660412847995758, "logps/chosen": -4.474730491638184, "logps/rejected": -4.919530391693115, "loss": 0.0522, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.474730491638184, "rewards/margins": 0.44480031728744507, "rewards/rejected": -4.919530391693115, "sft_loss": 4.191833972930908, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 0.45261575047030345, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.33118414878845215, "logits/rejected": -0.13877174258232117, "logps/chosen": -4.586190223693848, "logps/rejected": -5.225762844085693, "loss": 0.0525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.586190223693848, "rewards/margins": 0.6395732760429382, "rewards/rejected": -5.225762844085693, "sft_loss": 4.352536678314209, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 0.9186138520912633, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.3902810513973236, "logits/rejected": -0.2011948525905609, "logps/chosen": -4.392946243286133, "logps/rejected": -4.966264724731445, "loss": 0.0523, "rewards/accuracies": 0.625, "rewards/chosen": -4.392946243286133, "rewards/margins": 0.5733183026313782, "rewards/rejected": -4.966264724731445, "sft_loss": 4.123747825622559, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 0.5968579774371027, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.4745398461818695, "logits/rejected": -0.30128827691078186, "logps/chosen": -4.585610389709473, "logps/rejected": -5.091395854949951, "loss": 0.053, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.585610389709473, "rewards/margins": 0.5057860612869263, "rewards/rejected": -5.091395854949951, "sft_loss": 4.311280250549316, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 0.686411141611339, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.49996957182884216, "logits/rejected": -0.3136758804321289, "logps/chosen": -4.660679340362549, "logps/rejected": -5.197052955627441, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.660679340362549, "rewards/margins": 0.5363737344741821, "rewards/rejected": -5.197052955627441, "sft_loss": 4.413455009460449, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 0.7098920905319643, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.32590216398239136, "logits/rejected": -0.15566875040531158, "logps/chosen": -4.4894208908081055, "logps/rejected": -4.915673732757568, "loss": 0.0535, "rewards/accuracies": 0.65625, "rewards/chosen": -4.4894208908081055, "rewards/margins": 0.42625269293785095, "rewards/rejected": -4.915673732757568, "sft_loss": 4.128401279449463, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 0.7087262409431777, "learning_rate": 9.34021460937342e-07, "logits/chosen": -0.35985809564590454, "logits/rejected": -0.2987968325614929, "logps/chosen": -4.599359035491943, "logps/rejected": -4.990583419799805, "loss": 0.0536, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.599359035491943, "rewards/margins": 0.3912242352962494, "rewards/rejected": -4.990583419799805, "sft_loss": 4.331721782684326, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 0.7809553755456508, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.6510784029960632, "logits/rejected": -0.43074244260787964, "logps/chosen": -4.728677272796631, "logps/rejected": -5.084107875823975, "loss": 0.0536, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.728677272796631, "rewards/margins": 0.3554309904575348, "rewards/rejected": -5.084107875823975, "sft_loss": 4.5105485916137695, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 0.8865709812275551, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.5979700088500977, "logits/rejected": -0.29173022508621216, "logps/chosen": -4.386279582977295, "logps/rejected": -4.95373010635376, "loss": 0.0531, "rewards/accuracies": 0.625, "rewards/chosen": -4.386279582977295, "rewards/margins": 0.5674503445625305, "rewards/rejected": -4.95373010635376, "sft_loss": 4.1684088706970215, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 0.6939758542037217, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.49798783659935, "logits/rejected": -0.2839065194129944, "logps/chosen": -4.452348232269287, "logps/rejected": -4.926584720611572, "loss": 0.0529, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.452348232269287, "rewards/margins": 0.47423630952835083, "rewards/rejected": -4.926584720611572, "sft_loss": 4.255247116088867, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 0.5896298856164389, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.37618792057037354, "logits/rejected": -0.2516014575958252, "logps/chosen": -4.588209629058838, "logps/rejected": -4.932555198669434, "loss": 0.0524, "rewards/accuracies": 0.625, "rewards/chosen": -4.588209629058838, "rewards/margins": 0.34434524178504944, "rewards/rejected": -4.932555198669434, "sft_loss": 4.211440086364746, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 0.5048753688209329, "learning_rate": 9.301028145701543e-07, "logits/chosen": -0.3792995810508728, "logits/rejected": -0.17535944283008575, "logps/chosen": -4.544643402099609, "logps/rejected": -5.277196884155273, "loss": 0.052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.544643402099609, "rewards/margins": 0.7325533032417297, "rewards/rejected": -5.277196884155273, "sft_loss": 4.257083415985107, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 0.5342795514278404, "learning_rate": 9.293065361002563e-07, "logits/chosen": -0.36811989545822144, "logits/rejected": -0.15171898901462555, "logps/chosen": -4.433106422424316, "logps/rejected": -5.020786285400391, "loss": 0.0541, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.433106422424316, "rewards/margins": 0.5876799821853638, "rewards/rejected": -5.020786285400391, "sft_loss": 4.114101409912109, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 0.535835981919384, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.4679451584815979, "logits/rejected": -0.3156459927558899, "logps/chosen": -4.755094528198242, "logps/rejected": -5.195936679840088, "loss": 0.0524, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.755094528198242, "rewards/margins": 0.4408422112464905, "rewards/rejected": -5.195936679840088, "sft_loss": 4.333351135253906, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 0.842033204355517, "learning_rate": 9.277014915246792e-07, "logits/chosen": -0.38147813081741333, "logits/rejected": -0.31262868642807007, "logps/chosen": -4.662112236022949, "logps/rejected": -5.163732051849365, "loss": 0.0532, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.662112236022949, "rewards/margins": 0.5016202330589294, "rewards/rejected": -5.163732051849365, "sft_loss": 4.4246416091918945, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 0.41952140030922824, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.43312758207321167, "logits/rejected": -0.32057255506515503, "logps/chosen": -4.355666637420654, "logps/rejected": -4.759754657745361, "loss": 0.053, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.355666637420654, "rewards/margins": 0.4040871560573578, "rewards/rejected": -4.759754657745361, "sft_loss": 4.02143669128418, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 0.5404524013697792, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.5744115114212036, "logits/rejected": -0.2570160925388336, "logps/chosen": -4.677066802978516, "logps/rejected": -5.198808193206787, "loss": 0.0522, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.677066802978516, "rewards/margins": 0.521741509437561, "rewards/rejected": -5.198808193206787, "sft_loss": 4.395870685577393, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 0.5278130869284325, "learning_rate": 9.252628226650389e-07, "logits/chosen": -0.4492534101009369, "logits/rejected": -0.31946879625320435, "logps/chosen": -4.7667155265808105, "logps/rejected": -4.992683410644531, "loss": 0.0537, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.7667155265808105, "rewards/margins": 0.2259684056043625, "rewards/rejected": -4.992683410644531, "sft_loss": 4.413228511810303, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 0.772880187311183, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.4478439688682556, "logits/rejected": -0.20472392439842224, "logps/chosen": -4.402605056762695, "logps/rejected": -5.060262680053711, "loss": 0.0522, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.402605056762695, "rewards/margins": 0.6576577425003052, "rewards/rejected": -5.060262680053711, "sft_loss": 4.161447048187256, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 0.7775561810812492, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.42596834897994995, "logits/rejected": -0.1455395519733429, "logps/chosen": -4.4289679527282715, "logps/rejected": -4.8820953369140625, "loss": 0.0517, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.4289679527282715, "rewards/margins": 0.45312729477882385, "rewards/rejected": -4.8820953369140625, "sft_loss": 4.115038871765137, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 0.36504463479697746, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.3154129981994629, "logits/rejected": -0.11409089714288712, "logps/chosen": -4.550224781036377, "logps/rejected": -4.950064659118652, "loss": 0.0531, "rewards/accuracies": 0.65625, "rewards/chosen": -4.550224781036377, "rewards/margins": 0.39984145760536194, "rewards/rejected": -4.950064659118652, "sft_loss": 4.2174882888793945, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 0.7277940278719969, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.3689365088939667, "logits/rejected": -0.37266969680786133, "logps/chosen": -4.800908088684082, "logps/rejected": -5.217124938964844, "loss": 0.0536, "rewards/accuracies": 0.625, "rewards/chosen": -4.800908088684082, "rewards/margins": 0.4162166118621826, "rewards/rejected": -5.217124938964844, "sft_loss": 4.4879150390625, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 0.5014357810674278, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.456494003534317, "logits/rejected": -0.20387740433216095, "logps/chosen": -4.5678606033325195, "logps/rejected": -5.184647560119629, "loss": 0.0517, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.5678606033325195, "rewards/margins": 0.6167860627174377, "rewards/rejected": -5.184647560119629, "sft_loss": 4.280999183654785, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 0.8280312948646985, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.17272579669952393, "logits/rejected": -0.06938336789608002, "logps/chosen": -4.48028039932251, "logps/rejected": -4.926800727844238, "loss": 0.0535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.48028039932251, "rewards/margins": 0.4465200901031494, "rewards/rejected": -4.926800727844238, "sft_loss": 4.15725040435791, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 0.43610312586235883, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.28351375460624695, "logits/rejected": -0.08770108222961426, "logps/chosen": -4.448962211608887, "logps/rejected": -4.971872329711914, "loss": 0.0512, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.448962211608887, "rewards/margins": 0.5229107141494751, "rewards/rejected": -4.971872329711914, "sft_loss": 3.9980177879333496, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 0.7384580305138214, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.317364364862442, "logits/rejected": -0.17575505375862122, "logps/chosen": -4.727007865905762, "logps/rejected": -5.317923545837402, "loss": 0.0519, "rewards/accuracies": 0.6875, "rewards/chosen": -4.727007865905762, "rewards/margins": 0.5909159779548645, "rewards/rejected": -5.317923545837402, "sft_loss": 4.3875274658203125, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 0.4737735564077406, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.29782262444496155, "logits/rejected": -0.20921726524829865, "logps/chosen": -4.792957782745361, "logps/rejected": -5.149683952331543, "loss": 0.0545, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.792957782745361, "rewards/margins": 0.3567260503768921, "rewards/rejected": -5.149683952331543, "sft_loss": 4.424458026885986, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 0.5026818798426406, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.405927836894989, "logits/rejected": -0.22714976966381073, "logps/chosen": -4.681812286376953, "logps/rejected": -5.035951137542725, "loss": 0.0546, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.681812286376953, "rewards/margins": 0.3541390299797058, "rewards/rejected": -5.035951137542725, "sft_loss": 4.365417957305908, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 0.7029362410222331, "learning_rate": 9.16004998712373e-07, "logits/chosen": -0.41014689207077026, "logits/rejected": -0.2868584990501404, "logps/chosen": -4.597723960876465, "logps/rejected": -4.962946891784668, "loss": 0.0529, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.597723960876465, "rewards/margins": 0.3652224540710449, "rewards/rejected": -4.962946891784668, "sft_loss": 4.283852577209473, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 0.7647895308266636, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.5162456631660461, "logits/rejected": -0.043417371809482574, "logps/chosen": -4.48927640914917, "logps/rejected": -5.17399787902832, "loss": 0.0519, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.48927640914917, "rewards/margins": 0.6847215890884399, "rewards/rejected": -5.17399787902832, "sft_loss": 4.273918151855469, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 0.48886842225485777, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.3623657822608948, "logits/rejected": -0.2918470501899719, "logps/chosen": -4.423482418060303, "logps/rejected": -4.869231700897217, "loss": 0.0534, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.423482418060303, "rewards/margins": 0.4457489550113678, "rewards/rejected": -4.869231700897217, "sft_loss": 4.199785232543945, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 0.5651971687665704, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.3085786700248718, "logits/rejected": -0.18533708155155182, "logps/chosen": -4.585659027099609, "logps/rejected": -4.924344062805176, "loss": 0.0531, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.585659027099609, "rewards/margins": 0.338684618473053, "rewards/rejected": -4.924344062805176, "sft_loss": 4.329022407531738, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 0.34537796990734165, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.4657418131828308, "logits/rejected": -0.21361199021339417, "logps/chosen": -4.482302665710449, "logps/rejected": -4.993915557861328, "loss": 0.0523, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.482302665710449, "rewards/margins": 0.5116127729415894, "rewards/rejected": -4.993915557861328, "sft_loss": 4.212442874908447, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 0.6353485867641818, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.3445362150669098, "logits/rejected": -0.11221824586391449, "logps/chosen": -4.467150688171387, "logps/rejected": -4.882587909698486, "loss": 0.0526, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.467150688171387, "rewards/margins": 0.4154374599456787, "rewards/rejected": -4.882587909698486, "sft_loss": 4.144843101501465, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 0.7708747969935229, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.31633177399635315, "logits/rejected": -0.053336597979068756, "logps/chosen": -4.542233467102051, "logps/rejected": -5.012372016906738, "loss": 0.0522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.542233467102051, "rewards/margins": 0.47013846039772034, "rewards/rejected": -5.012372016906738, "sft_loss": 4.1416215896606445, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 1.159682619717436, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.22155523300170898, "logits/rejected": -0.020089667290449142, "logps/chosen": -4.597943305969238, "logps/rejected": -5.248732089996338, "loss": 0.0524, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.597943305969238, "rewards/margins": 0.6507889628410339, "rewards/rejected": -5.248732089996338, "sft_loss": 4.222757816314697, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 0.8083866532979972, "learning_rate": 9.089646803833588e-07, "logits/chosen": -0.2432478666305542, "logits/rejected": -0.03565299138426781, "logps/chosen": -4.642041206359863, "logps/rejected": -5.277072906494141, "loss": 0.0516, "rewards/accuracies": 0.71875, "rewards/chosen": -4.642041206359863, "rewards/margins": 0.635032057762146, "rewards/rejected": -5.277072906494141, "sft_loss": 4.315460205078125, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 0.7014158947204401, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.4408930242061615, "logits/rejected": -0.046825408935546875, "logps/chosen": -4.495620250701904, "logps/rejected": -5.293448448181152, "loss": 0.0511, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.495620250701904, "rewards/margins": 0.7978277802467346, "rewards/rejected": -5.293448448181152, "sft_loss": 4.236871242523193, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 0.9608041180526156, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.37728947401046753, "logits/rejected": -0.00990285724401474, "logps/chosen": -4.229923248291016, "logps/rejected": -5.000068664550781, "loss": 0.0508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.229923248291016, "rewards/margins": 0.7701452970504761, "rewards/rejected": -5.000068664550781, "sft_loss": 3.8961243629455566, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 0.5225311878998186, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.20780956745147705, "logits/rejected": -0.12729039788246155, "logps/chosen": -4.6606974601745605, "logps/rejected": -5.065556526184082, "loss": 0.0538, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.6606974601745605, "rewards/margins": 0.40485963225364685, "rewards/rejected": -5.065556526184082, "sft_loss": 4.36079740524292, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 0.5773964237041671, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.25536245107650757, "logits/rejected": -0.1844777762889862, "logps/chosen": -4.609926223754883, "logps/rejected": -5.045127868652344, "loss": 0.0537, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.609926223754883, "rewards/margins": 0.4352017343044281, "rewards/rejected": -5.045127868652344, "sft_loss": 4.38587760925293, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 0.7640839157989326, "learning_rate": 9.044352511642661e-07, "logits/chosen": -0.3420313000679016, "logits/rejected": -0.23492303490638733, "logps/chosen": -4.852427005767822, "logps/rejected": -5.167304039001465, "loss": 0.0543, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.852427005767822, "rewards/margins": 0.31487753987312317, "rewards/rejected": -5.167304039001465, "sft_loss": 4.611882209777832, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 0.5554897907915322, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.4127574861049652, "logits/rejected": -0.2540976107120514, "logps/chosen": -4.4274468421936035, "logps/rejected": -4.706323146820068, "loss": 0.055, "rewards/accuracies": 0.625, "rewards/chosen": -4.4274468421936035, "rewards/margins": 0.2788761854171753, "rewards/rejected": -4.706323146820068, "sft_loss": 4.194740295410156, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 0.4757489938485724, "learning_rate": 9.025959508580436e-07, "logits/chosen": -0.38364917039871216, "logits/rejected": -0.012453851290047169, "logps/chosen": -4.412278652191162, "logps/rejected": -4.889250755310059, "loss": 0.0519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.412278652191162, "rewards/margins": 0.4769721031188965, "rewards/rejected": -4.889250755310059, "sft_loss": 4.152989387512207, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 0.4418324458419384, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.3964102864265442, "logits/rejected": -0.14374807476997375, "logps/chosen": -4.395001411437988, "logps/rejected": -5.0430474281311035, "loss": 0.0519, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.395001411437988, "rewards/margins": 0.6480464935302734, "rewards/rejected": -5.0430474281311035, "sft_loss": 4.156543731689453, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 0.7293779185730094, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.42947110533714294, "logits/rejected": -0.3990377187728882, "logps/chosen": -4.6490654945373535, "logps/rejected": -4.978527545928955, "loss": 0.0535, "rewards/accuracies": 0.625, "rewards/chosen": -4.6490654945373535, "rewards/margins": 0.3294626772403717, "rewards/rejected": -4.978527545928955, "sft_loss": 4.322227954864502, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 0.7207848366156323, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.3351363241672516, "logits/rejected": -0.2901817560195923, "logps/chosen": -4.815976619720459, "logps/rejected": -5.248465538024902, "loss": 0.0533, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.815976619720459, "rewards/margins": 0.43248969316482544, "rewards/rejected": -5.248465538024902, "sft_loss": 4.382091999053955, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 0.49484538896096164, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.49074602127075195, "logits/rejected": -0.2281077802181244, "logps/chosen": -4.58762264251709, "logps/rejected": -5.309582710266113, "loss": 0.0508, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.58762264251709, "rewards/margins": 0.7219597697257996, "rewards/rejected": -5.309582710266113, "sft_loss": 4.244722843170166, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.1344272345304489, "eval_logits/rejected": 0.2497575730085373, "eval_logps/chosen": -4.521986961364746, "eval_logps/rejected": -5.008147239685059, "eval_loss": 0.05116863548755646, "eval_rewards/accuracies": 0.6491097807884216, "eval_rewards/chosen": -4.521986961364746, "eval_rewards/margins": 0.4861602187156677, "eval_rewards/rejected": -5.008147239685059, "eval_runtime": 46.9917, "eval_samples_per_second": 28.622, "eval_sft_loss": 4.069005012512207, "eval_steps_per_second": 7.171, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 0.7700792987909814, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.4408310055732727, "logits/rejected": -0.11122790724039078, "logps/chosen": -4.3237690925598145, "logps/rejected": -4.888923168182373, "loss": 0.0514, "rewards/accuracies": 0.71875, "rewards/chosen": -4.3237690925598145, "rewards/margins": 0.5651546716690063, "rewards/rejected": -4.888923168182373, "sft_loss": 3.9699337482452393, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 0.4500857637288236, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.44538015127182007, "logits/rejected": -0.18109442293643951, "logps/chosen": -4.5957112312316895, "logps/rejected": -5.148885250091553, "loss": 0.0529, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.5957112312316895, "rewards/margins": 0.5531740784645081, "rewards/rejected": -5.148885250091553, "sft_loss": 4.301665306091309, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 0.4853235079400437, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.31681299209594727, "logits/rejected": -0.1446036398410797, "logps/chosen": -4.822012901306152, "logps/rejected": -5.373905658721924, "loss": 0.053, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.822012901306152, "rewards/margins": 0.5518924593925476, "rewards/rejected": -5.373905658721924, "sft_loss": 4.410131931304932, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 0.6305197858891428, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.28197187185287476, "logits/rejected": -0.013742757961153984, "logps/chosen": -4.69413948059082, "logps/rejected": -5.082732200622559, "loss": 0.0537, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.69413948059082, "rewards/margins": 0.3885928690433502, "rewards/rejected": -5.082732200622559, "sft_loss": 4.341333389282227, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 0.8534848769692264, "learning_rate": 8.941267982915213e-07, "logits/chosen": -0.2706916630268097, "logits/rejected": -0.190748929977417, "logps/chosen": -4.614697456359863, "logps/rejected": -4.935882091522217, "loss": 0.0548, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.614697456359863, "rewards/margins": 0.3211846649646759, "rewards/rejected": -4.935882091522217, "sft_loss": 4.341588973999023, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 0.5827853719471676, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.33987802267074585, "logits/rejected": -0.14849236607551575, "logps/chosen": -4.5428361892700195, "logps/rejected": -5.143988609313965, "loss": 0.0528, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.5428361892700195, "rewards/margins": 0.6011531352996826, "rewards/rejected": -5.143988609313965, "sft_loss": 4.327922344207764, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 0.6927078960178196, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.4181820750236511, "logits/rejected": -0.19935372471809387, "logps/chosen": -4.4195332527160645, "logps/rejected": -5.059715747833252, "loss": 0.0526, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.4195332527160645, "rewards/margins": 0.6401824355125427, "rewards/rejected": -5.059715747833252, "sft_loss": 4.210007667541504, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 0.6920786026169751, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.4544641375541687, "logits/rejected": -0.23299141228199005, "logps/chosen": -4.649049282073975, "logps/rejected": -5.273727893829346, "loss": 0.053, "rewards/accuracies": 0.65625, "rewards/chosen": -4.649049282073975, "rewards/margins": 0.624678909778595, "rewards/rejected": -5.273727893829346, "sft_loss": 4.335822105407715, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 0.7030352155468556, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.4106753468513489, "logits/rejected": -0.3475349247455597, "logps/chosen": -4.661510467529297, "logps/rejected": -4.993771076202393, "loss": 0.0545, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.661510467529297, "rewards/margins": 0.3322606682777405, "rewards/rejected": -4.993771076202393, "sft_loss": 4.4329681396484375, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 1.0133916891769041, "learning_rate": 8.892874524469537e-07, "logits/chosen": -0.2660573124885559, "logits/rejected": -0.24451692402362823, "logps/chosen": -4.353519439697266, "logps/rejected": -4.80262565612793, "loss": 0.0522, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.353519439697266, "rewards/margins": 0.44910645484924316, "rewards/rejected": -4.80262565612793, "sft_loss": 4.052321910858154, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 0.38158224822284403, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.4017801284790039, "logits/rejected": -0.2663123607635498, "logps/chosen": -4.606898307800293, "logps/rejected": -5.036543846130371, "loss": 0.0524, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.606898307800293, "rewards/margins": 0.4296456277370453, "rewards/rejected": -5.036543846130371, "sft_loss": 4.341722011566162, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 0.7340126012410843, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.39234843850135803, "logits/rejected": -0.35262423753738403, "logps/chosen": -4.529763221740723, "logps/rejected": -5.034109115600586, "loss": 0.0526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.529763221740723, "rewards/margins": 0.5043456554412842, "rewards/rejected": -5.034109115600586, "sft_loss": 4.276070594787598, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 0.6370790584035594, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.19702807068824768, "logits/rejected": -0.1618746817111969, "logps/chosen": -4.514230728149414, "logps/rejected": -4.929436683654785, "loss": 0.0537, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.514230728149414, "rewards/margins": 0.4152059555053711, "rewards/rejected": -4.929436683654785, "sft_loss": 4.185002326965332, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 0.5247056428604062, "learning_rate": 8.853479363438342e-07, "logits/chosen": -0.1723632663488388, "logits/rejected": 0.06548583507537842, "logps/chosen": -4.680706977844238, "logps/rejected": -5.158249855041504, "loss": 0.0538, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.680706977844238, "rewards/margins": 0.4775429666042328, "rewards/rejected": -5.158249855041504, "sft_loss": 4.3549485206604, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 0.6651408971999766, "learning_rate": 8.843536867202588e-07, "logits/chosen": -0.33580222725868225, "logits/rejected": -0.09968128800392151, "logps/chosen": -4.62811803817749, "logps/rejected": -5.245832920074463, "loss": 0.0532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.62811803817749, "rewards/margins": 0.6177145838737488, "rewards/rejected": -5.245832920074463, "sft_loss": 4.451796054840088, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 0.4956364134232871, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.38807135820388794, "logits/rejected": -0.22442781925201416, "logps/chosen": -4.475585460662842, "logps/rejected": -4.869357109069824, "loss": 0.0545, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.475585460662842, "rewards/margins": 0.3937712609767914, "rewards/rejected": -4.869357109069824, "sft_loss": 4.2697858810424805, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 0.4533210270930005, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.37864798307418823, "logits/rejected": -0.06893188506364822, "logps/chosen": -4.537407398223877, "logps/rejected": -5.067080497741699, "loss": 0.0522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.537407398223877, "rewards/margins": 0.5296733975410461, "rewards/rejected": -5.067080497741699, "sft_loss": 4.29558801651001, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 0.456636894174162, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.3543255925178528, "logits/rejected": -0.08065290749073029, "logps/chosen": -4.4864702224731445, "logps/rejected": -5.07802677154541, "loss": 0.051, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.4864702224731445, "rewards/margins": 0.5915566682815552, "rewards/rejected": -5.07802677154541, "sft_loss": 4.127684593200684, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 0.7667898337451047, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.3511362671852112, "logits/rejected": -0.14278550446033478, "logps/chosen": -4.555008888244629, "logps/rejected": -5.053010940551758, "loss": 0.0523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.555008888244629, "rewards/margins": 0.4980013370513916, "rewards/rejected": -5.053010940551758, "sft_loss": 4.179459571838379, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 0.6244641339300205, "learning_rate": 8.793266977736342e-07, "logits/chosen": -0.15665873885154724, "logits/rejected": -0.2923945486545563, "logps/chosen": -4.709486961364746, "logps/rejected": -4.942587852478027, "loss": 0.0551, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.709486961364746, "rewards/margins": 0.23310072720050812, "rewards/rejected": -4.942587852478027, "sft_loss": 4.443274021148682, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 0.6618974427260913, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.22444629669189453, "logits/rejected": -0.08571354299783707, "logps/chosen": -4.598001480102539, "logps/rejected": -5.154397487640381, "loss": 0.052, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.598001480102539, "rewards/margins": 0.5563960671424866, "rewards/rejected": -5.154397487640381, "sft_loss": 4.261946678161621, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 0.7041285882219205, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.26814156770706177, "logits/rejected": -0.09805545210838318, "logps/chosen": -4.7671709060668945, "logps/rejected": -5.136244773864746, "loss": 0.0529, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.7671709060668945, "rewards/margins": 0.36907365918159485, "rewards/rejected": -5.136244773864746, "sft_loss": 4.420782566070557, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 0.9292121767573851, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.3112557530403137, "logits/rejected": -0.09291480481624603, "logps/chosen": -4.473947048187256, "logps/rejected": -5.047905445098877, "loss": 0.0525, "rewards/accuracies": 0.65625, "rewards/chosen": -4.473947048187256, "rewards/margins": 0.5739586353302002, "rewards/rejected": -5.047905445098877, "sft_loss": 4.1539106369018555, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 0.7171640716077882, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.17702648043632507, "logits/rejected": -0.07772380113601685, "logps/chosen": -4.466500282287598, "logps/rejected": -5.065943717956543, "loss": 0.0521, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.466500282287598, "rewards/margins": 0.5994431972503662, "rewards/rejected": -5.065943717956543, "sft_loss": 4.121184349060059, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 0.6258952957206926, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.16714437305927277, "logits/rejected": -0.03038867749273777, "logps/chosen": -4.553987503051758, "logps/rejected": -5.151804447174072, "loss": 0.0518, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.553987503051758, "rewards/margins": 0.5978171229362488, "rewards/rejected": -5.151804447174072, "sft_loss": 4.207757472991943, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 0.4969764494484655, "learning_rate": 8.731729746982068e-07, "logits/chosen": -0.21663089096546173, "logits/rejected": -0.16002297401428223, "logps/chosen": -4.529235363006592, "logps/rejected": -4.968966484069824, "loss": 0.0529, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.529235363006592, "rewards/margins": 0.4397306442260742, "rewards/rejected": -4.968966484069824, "sft_loss": 4.295144081115723, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 0.5729234869228408, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.43732938170433044, "logits/rejected": -0.10048327594995499, "logps/chosen": -4.420714378356934, "logps/rejected": -5.147778511047363, "loss": 0.0529, "rewards/accuracies": 0.6875, "rewards/chosen": -4.420714378356934, "rewards/margins": 0.7270635366439819, "rewards/rejected": -5.147778511047363, "sft_loss": 4.189997673034668, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 0.4876023798545788, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.3679789900779724, "logits/rejected": -0.10510773956775665, "logps/chosen": -4.425709247589111, "logps/rejected": -4.967562198638916, "loss": 0.0517, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.425709247589111, "rewards/margins": 0.5418528318405151, "rewards/rejected": -4.967562198638916, "sft_loss": 4.166138648986816, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 0.4988609878636019, "learning_rate": 8.700471013287424e-07, "logits/chosen": -0.11972793191671371, "logits/rejected": -0.1516149938106537, "logps/chosen": -4.627892017364502, "logps/rejected": -5.007716178894043, "loss": 0.0534, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.627892017364502, "rewards/margins": 0.37982410192489624, "rewards/rejected": -5.007716178894043, "sft_loss": 4.289733409881592, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 0.6138860861212869, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.2044331133365631, "logits/rejected": -0.12514927983283997, "logps/chosen": -4.860374927520752, "logps/rejected": -5.20972204208374, "loss": 0.0556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.860374927520752, "rewards/margins": 0.34934720396995544, "rewards/rejected": -5.20972204208374, "sft_loss": 4.611658573150635, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 0.6603699568715765, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.27209264039993286, "logits/rejected": -0.11125469207763672, "logps/chosen": -4.411839008331299, "logps/rejected": -4.948624610900879, "loss": 0.0517, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.411839008331299, "rewards/margins": 0.5367849469184875, "rewards/rejected": -4.948624610900879, "sft_loss": 4.160839080810547, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 0.8262536390192445, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.2572443187236786, "logits/rejected": -0.04624100401997566, "logps/chosen": -4.364460468292236, "logps/rejected": -4.834274768829346, "loss": 0.053, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.364460468292236, "rewards/margins": 0.46981415152549744, "rewards/rejected": -4.834274768829346, "sft_loss": 4.102207183837891, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 0.6191068674758381, "learning_rate": 8.658290552963827e-07, "logits/chosen": -0.15403051674365997, "logits/rejected": -0.0583893358707428, "logps/chosen": -4.623048782348633, "logps/rejected": -5.179881572723389, "loss": 0.0536, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.623048782348633, "rewards/margins": 0.5568326711654663, "rewards/rejected": -5.179881572723389, "sft_loss": 4.381789684295654, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 0.43461183223783423, "learning_rate": 8.647656449367966e-07, "logits/chosen": -0.17942842841148376, "logits/rejected": 0.036060623824596405, "logps/chosen": -4.722434997558594, "logps/rejected": -5.11240291595459, "loss": 0.0538, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.722434997558594, "rewards/margins": 0.3899684250354767, "rewards/rejected": -5.11240291595459, "sft_loss": 4.488548278808594, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 0.5292183247284945, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.3148379325866699, "logits/rejected": -0.12622442841529846, "logps/chosen": -4.515315532684326, "logps/rejected": -5.078459739685059, "loss": 0.0523, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.515315532684326, "rewards/margins": 0.5631445646286011, "rewards/rejected": -5.078459739685059, "sft_loss": 4.220766067504883, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 0.8995848344447617, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.30298176407814026, "logits/rejected": -0.15992002189159393, "logps/chosen": -4.513458251953125, "logps/rejected": -5.091177940368652, "loss": 0.0527, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.513458251953125, "rewards/margins": 0.5777191519737244, "rewards/rejected": -5.091177940368652, "sft_loss": 4.2787041664123535, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 0.567681451550632, "learning_rate": 8.615542215511389e-07, "logits/chosen": -0.2650911211967468, "logits/rejected": -0.19625148177146912, "logps/chosen": -4.545306205749512, "logps/rejected": -4.82927942276001, "loss": 0.0544, "rewards/accuracies": 0.59375, "rewards/chosen": -4.545306205749512, "rewards/margins": 0.2839727997779846, "rewards/rejected": -4.82927942276001, "sft_loss": 4.279045104980469, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 0.4350566579254937, "learning_rate": 8.604767176061241e-07, "logits/chosen": -0.2149418294429779, "logits/rejected": -0.07808341085910797, "logps/chosen": -4.562401294708252, "logps/rejected": -5.076463222503662, "loss": 0.0522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.562401294708252, "rewards/margins": 0.5140615701675415, "rewards/rejected": -5.076463222503662, "sft_loss": 4.298698902130127, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 0.4812827472314279, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.35899245738983154, "logits/rejected": -0.18311157822608948, "logps/chosen": -4.479128837585449, "logps/rejected": -5.071837425231934, "loss": 0.0521, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.479128837585449, "rewards/margins": 0.592708170413971, "rewards/rejected": -5.071837425231934, "sft_loss": 4.253784656524658, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 0.5327496826275652, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.37001967430114746, "logits/rejected": -0.3091837465763092, "logps/chosen": -4.516618251800537, "logps/rejected": -4.93643856048584, "loss": 0.0532, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.516618251800537, "rewards/margins": 0.4198206961154938, "rewards/rejected": -4.93643856048584, "sft_loss": 4.2589216232299805, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 0.802834245743257, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.4872976243495941, "logits/rejected": -0.32987093925476074, "logps/chosen": -4.4331231117248535, "logps/rejected": -5.15014123916626, "loss": 0.0518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.4331231117248535, "rewards/margins": 0.7170186042785645, "rewards/rejected": -5.15014123916626, "sft_loss": 4.194850921630859, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 0.6878570030804108, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.38983696699142456, "logits/rejected": -0.22500252723693848, "logps/chosen": -4.533890724182129, "logps/rejected": -5.05610466003418, "loss": 0.052, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.533890724182129, "rewards/margins": 0.5222145318984985, "rewards/rejected": -5.05610466003418, "sft_loss": 4.274322032928467, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 0.5121223276944379, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.2688700556755066, "logits/rejected": -0.09521832317113876, "logps/chosen": -4.299901485443115, "logps/rejected": -4.9609694480896, "loss": 0.051, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.299901485443115, "rewards/margins": 0.6610682606697083, "rewards/rejected": -4.9609694480896, "sft_loss": 4.028165817260742, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 0.6048520534653532, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.2053041160106659, "logits/rejected": -0.16262459754943848, "logps/chosen": -4.482122421264648, "logps/rejected": -5.020371913909912, "loss": 0.0524, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.482122421264648, "rewards/margins": 0.5382490754127502, "rewards/rejected": -5.020371913909912, "sft_loss": 4.157798767089844, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 0.4803677280460752, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.30528777837753296, "logits/rejected": -0.06126274913549423, "logps/chosen": -4.638838768005371, "logps/rejected": -5.484151363372803, "loss": 0.0524, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.638838768005371, "rewards/margins": 0.8453127145767212, "rewards/rejected": -5.484151363372803, "sft_loss": 4.383290767669678, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 0.441717668108046, "learning_rate": 8.51731666796467e-07, "logits/chosen": -0.1470208615064621, "logits/rejected": -0.09497350454330444, "logps/chosen": -4.461791515350342, "logps/rejected": -4.9654388427734375, "loss": 0.0524, "rewards/accuracies": 0.65625, "rewards/chosen": -4.461791515350342, "rewards/margins": 0.5036473274230957, "rewards/rejected": -4.9654388427734375, "sft_loss": 4.162781715393066, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 0.46083392095277986, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.2915559411048889, "logits/rejected": -0.18692262470722198, "logps/chosen": -4.589268684387207, "logps/rejected": -5.0538201332092285, "loss": 0.0521, "rewards/accuracies": 0.65625, "rewards/chosen": -4.589268684387207, "rewards/margins": 0.46455103158950806, "rewards/rejected": -5.0538201332092285, "sft_loss": 4.349760055541992, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 0.4360499079412217, "learning_rate": 8.495110657042488e-07, "logits/chosen": -0.3233645558357239, "logits/rejected": -0.06588099151849747, "logps/chosen": -4.448300361633301, "logps/rejected": -4.996584415435791, "loss": 0.0535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.448300361633301, "rewards/margins": 0.5482843518257141, "rewards/rejected": -4.996584415435791, "sft_loss": 4.263411521911621, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 0.5306953171524185, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.22878269851207733, "logits/rejected": -0.11855238676071167, "logps/chosen": -4.591235160827637, "logps/rejected": -4.991271018981934, "loss": 0.0527, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.591235160827637, "rewards/margins": 0.40003618597984314, "rewards/rejected": -4.991271018981934, "sft_loss": 4.270682334899902, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 0.3833376083394722, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.5186641216278076, "logits/rejected": -0.3550383448600769, "logps/chosen": -4.492537021636963, "logps/rejected": -4.917145729064941, "loss": 0.0536, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.492537021636963, "rewards/margins": 0.42460840940475464, "rewards/rejected": -4.917145729064941, "sft_loss": 4.207838535308838, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 0.43893085734184745, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.3054492473602295, "logits/rejected": -0.1130295991897583, "logps/chosen": -4.558745861053467, "logps/rejected": -5.160397529602051, "loss": 0.0504, "rewards/accuracies": 0.6875, "rewards/chosen": -4.558745861053467, "rewards/margins": 0.6016519069671631, "rewards/rejected": -5.160397529602051, "sft_loss": 4.105099201202393, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 0.5769251490857489, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.3264893889427185, "logits/rejected": -0.18141944706439972, "logps/chosen": -4.5322265625, "logps/rejected": -5.082207679748535, "loss": 0.0509, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.5322265625, "rewards/margins": 0.5499812960624695, "rewards/rejected": -5.082207679748535, "sft_loss": 4.100834846496582, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 0.5133368713565196, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.28852778673171997, "logits/rejected": -0.2075691968202591, "logps/chosen": -4.686163902282715, "logps/rejected": -5.1576972007751465, "loss": 0.052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.686163902282715, "rewards/margins": 0.4715335965156555, "rewards/rejected": -5.1576972007751465, "sft_loss": 4.286208152770996, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 0.4180399710199207, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.35929030179977417, "logits/rejected": -0.1946285218000412, "logps/chosen": -4.443009853363037, "logps/rejected": -5.0500898361206055, "loss": 0.0515, "rewards/accuracies": 0.71875, "rewards/chosen": -4.443009853363037, "rewards/margins": 0.6070801019668579, "rewards/rejected": -5.0500898361206055, "sft_loss": 4.139133453369141, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 0.5503632976186634, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.4894943833351135, "logits/rejected": -0.28924956917762756, "logps/chosen": -4.698008060455322, "logps/rejected": -5.052058696746826, "loss": 0.0537, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.698008060455322, "rewards/margins": 0.3540504276752472, "rewards/rejected": -5.052058696746826, "sft_loss": 4.405055046081543, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 0.45174404828507203, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.30120593309402466, "logits/rejected": -0.164491206407547, "logps/chosen": -4.559738636016846, "logps/rejected": -5.136922836303711, "loss": 0.0513, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.559738636016846, "rewards/margins": 0.5771840214729309, "rewards/rejected": -5.136922836303711, "sft_loss": 4.184256076812744, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 0.34956396396189543, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.3214988112449646, "logits/rejected": -0.2575382888317108, "logps/chosen": -4.629532814025879, "logps/rejected": -5.043061256408691, "loss": 0.0534, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.629532814025879, "rewards/margins": 0.4135282635688782, "rewards/rejected": -5.043061256408691, "sft_loss": 4.252536773681641, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 0.6764223132300197, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.3845653831958771, "logits/rejected": -0.08716684579849243, "logps/chosen": -4.3737688064575195, "logps/rejected": -4.897772312164307, "loss": 0.0524, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.3737688064575195, "rewards/margins": 0.5240030884742737, "rewards/rejected": -4.897772312164307, "sft_loss": 4.154975891113281, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 0.5770424148011009, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.27163565158843994, "logits/rejected": -0.2584533095359802, "logps/chosen": -4.548377990722656, "logps/rejected": -4.94322395324707, "loss": 0.0522, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.548377990722656, "rewards/margins": 0.39484524726867676, "rewards/rejected": -4.94322395324707, "sft_loss": 4.188554763793945, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 0.5114431601672031, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.4921305775642395, "logits/rejected": -0.08544237911701202, "logps/chosen": -4.601258277893066, "logps/rejected": -5.190618991851807, "loss": 0.0514, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.601258277893066, "rewards/margins": 0.5893611907958984, "rewards/rejected": -5.190618991851807, "sft_loss": 4.346938133239746, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 0.9844192428537916, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.21924248337745667, "logits/rejected": -0.037383563816547394, "logps/chosen": -4.587491989135742, "logps/rejected": -4.976442813873291, "loss": 0.0542, "rewards/accuracies": 0.59375, "rewards/chosen": -4.587491989135742, "rewards/margins": 0.38895124197006226, "rewards/rejected": -4.976442813873291, "sft_loss": 4.243929386138916, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 0.524874546523782, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.3437415063381195, "logits/rejected": -0.17452563345432281, "logps/chosen": -4.569422721862793, "logps/rejected": -4.986387252807617, "loss": 0.0534, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.569422721862793, "rewards/margins": 0.41696444153785706, "rewards/rejected": -4.986387252807617, "sft_loss": 4.248903751373291, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 0.35099039329842446, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.32231372594833374, "logits/rejected": -0.15310195088386536, "logps/chosen": -4.564090728759766, "logps/rejected": -5.047003269195557, "loss": 0.0519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.564090728759766, "rewards/margins": 0.482913076877594, "rewards/rejected": -5.047003269195557, "sft_loss": 4.228408336639404, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 0.2853568267955546, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.3368076980113983, "logits/rejected": -0.21735279262065887, "logps/chosen": -4.621280670166016, "logps/rejected": -5.160231590270996, "loss": 0.0524, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.621280670166016, "rewards/margins": 0.5389507412910461, "rewards/rejected": -5.160231590270996, "sft_loss": 4.306357383728027, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 0.4608071152061401, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.4223474860191345, "logits/rejected": -0.10327117145061493, "logps/chosen": -4.403355598449707, "logps/rejected": -4.978325843811035, "loss": 0.0511, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.403355598449707, "rewards/margins": 0.5749701857566833, "rewards/rejected": -4.978325843811035, "sft_loss": 4.0618391036987305, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 0.5609992865362639, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.23218891024589539, "logits/rejected": 1.4790147361054551e-05, "logps/chosen": -4.453673839569092, "logps/rejected": -5.110269546508789, "loss": 0.0518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.453673839569092, "rewards/margins": 0.6565961837768555, "rewards/rejected": -5.110269546508789, "sft_loss": 4.1434006690979, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 0.5850822593289026, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.2841840386390686, "logits/rejected": 0.012512536719441414, "logps/chosen": -4.500065803527832, "logps/rejected": -5.142590522766113, "loss": 0.0526, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.500065803527832, "rewards/margins": 0.6425246596336365, "rewards/rejected": -5.142590522766113, "sft_loss": 4.258237838745117, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 0.7141136511961801, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.1625567376613617, "logits/rejected": -0.03078523278236389, "logps/chosen": -4.5068159103393555, "logps/rejected": -5.23297119140625, "loss": 0.0508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.5068159103393555, "rewards/margins": 0.7261554598808289, "rewards/rejected": -5.23297119140625, "sft_loss": 4.137577056884766, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 0.43605856386239633, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.28013330698013306, "logits/rejected": -0.07744655758142471, "logps/chosen": -4.623358249664307, "logps/rejected": -5.3566060066223145, "loss": 0.0521, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.623358249664307, "rewards/margins": 0.7332478165626526, "rewards/rejected": -5.3566060066223145, "sft_loss": 4.287482738494873, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 0.39378033583895794, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.21353749930858612, "logits/rejected": 0.08030889183282852, "logps/chosen": -4.414605140686035, "logps/rejected": -5.014411449432373, "loss": 0.0523, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.414605140686035, "rewards/margins": 0.5998064279556274, "rewards/rejected": -5.014411449432373, "sft_loss": 4.133689880371094, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 0.4990210435130263, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.35716816782951355, "logits/rejected": -0.269197553396225, "logps/chosen": -4.437384605407715, "logps/rejected": -4.960318088531494, "loss": 0.0529, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.437384605407715, "rewards/margins": 0.5229335427284241, "rewards/rejected": -4.960318088531494, "sft_loss": 4.239546775817871, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 0.35144504822556205, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.24343474209308624, "logits/rejected": -0.2140563428401947, "logps/chosen": -4.5842461585998535, "logps/rejected": -4.974714756011963, "loss": 0.0528, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.5842461585998535, "rewards/margins": 0.3904687762260437, "rewards/rejected": -4.974714756011963, "sft_loss": 4.3355817794799805, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 0.5694804322434417, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.2745942175388336, "logits/rejected": -0.18785987794399261, "logps/chosen": -4.44666051864624, "logps/rejected": -5.071129322052002, "loss": 0.0514, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.44666051864624, "rewards/margins": 0.6244686841964722, "rewards/rejected": -5.071129322052002, "sft_loss": 4.212424278259277, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 0.5013521036883045, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.3253830075263977, "logits/rejected": -0.11694759130477905, "logps/chosen": -4.50570821762085, "logps/rejected": -5.062636852264404, "loss": 0.052, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.50570821762085, "rewards/margins": 0.5569278001785278, "rewards/rejected": -5.062636852264404, "sft_loss": 4.15521764755249, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 0.4268697531621967, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.14694912731647491, "logits/rejected": -0.08554248511791229, "logps/chosen": -4.438208103179932, "logps/rejected": -5.099421501159668, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.438208103179932, "rewards/margins": 0.6612135767936707, "rewards/rejected": -5.099421501159668, "sft_loss": 4.202122688293457, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 0.5113103606821928, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.20020289719104767, "logits/rejected": -0.05032138153910637, "logps/chosen": -4.628384113311768, "logps/rejected": -5.130135536193848, "loss": 0.0525, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.628384113311768, "rewards/margins": 0.5017513036727905, "rewards/rejected": -5.130135536193848, "sft_loss": 4.327366828918457, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 0.451200741295089, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.28399142622947693, "logits/rejected": -0.098796546459198, "logps/chosen": -4.541468620300293, "logps/rejected": -4.977931022644043, "loss": 0.0519, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.541468620300293, "rewards/margins": 0.43646302819252014, "rewards/rejected": -4.977931022644043, "sft_loss": 4.1500091552734375, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 0.5048494393677178, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.3281692564487457, "logits/rejected": -0.16416522860527039, "logps/chosen": -4.572214126586914, "logps/rejected": -5.139347076416016, "loss": 0.0528, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.572214126586914, "rewards/margins": 0.5671325922012329, "rewards/rejected": -5.139347076416016, "sft_loss": 4.260963439941406, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 0.7189935633369484, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.2590765953063965, "logits/rejected": -0.04907204583287239, "logps/chosen": -4.604923248291016, "logps/rejected": -5.234789848327637, "loss": 0.0524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.604923248291016, "rewards/margins": 0.6298665404319763, "rewards/rejected": -5.234789848327637, "sft_loss": 4.281820774078369, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 0.34328131015830476, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.40063053369522095, "logits/rejected": -0.19102320075035095, "logps/chosen": -4.727412700653076, "logps/rejected": -5.302260398864746, "loss": 0.0529, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.727412700653076, "rewards/margins": 0.5748476386070251, "rewards/rejected": -5.302260398864746, "sft_loss": 4.363173007965088, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.2181275635957718, "eval_logits/rejected": 0.32679614424705505, "eval_logps/chosen": -4.391653060913086, "eval_logps/rejected": -4.964575290679932, "eval_loss": 0.050767455250024796, "eval_rewards/accuracies": 0.6520771384239197, "eval_rewards/chosen": -4.391653060913086, "eval_rewards/margins": 0.5729230046272278, "eval_rewards/rejected": -4.964575290679932, "eval_runtime": 46.8943, "eval_samples_per_second": 28.681, "eval_sft_loss": 3.9195380210876465, "eval_steps_per_second": 7.186, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 0.5010503749955464, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.2518996596336365, "logits/rejected": -0.21349790692329407, "logps/chosen": -4.4844794273376465, "logps/rejected": -4.871996879577637, "loss": 0.0534, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.4844794273376465, "rewards/margins": 0.3875174820423126, "rewards/rejected": -4.871996879577637, "sft_loss": 4.148735046386719, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 0.5267306082118653, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.3113710582256317, "logits/rejected": -0.1672811210155487, "logps/chosen": -4.509435176849365, "logps/rejected": -5.137767314910889, "loss": 0.051, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.509435176849365, "rewards/margins": 0.628332257270813, "rewards/rejected": -5.137767314910889, "sft_loss": 4.157217979431152, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 0.6431378047075401, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.26741084456443787, "logits/rejected": -0.12249056994915009, "logps/chosen": -4.71677827835083, "logps/rejected": -5.193902015686035, "loss": 0.0531, "rewards/accuracies": 0.65625, "rewards/chosen": -4.71677827835083, "rewards/margins": 0.4771236479282379, "rewards/rejected": -5.193902015686035, "sft_loss": 4.391667366027832, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 0.6615586107166485, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.4982661306858063, "logits/rejected": -0.18780794739723206, "logps/chosen": -4.37764310836792, "logps/rejected": -5.1312479972839355, "loss": 0.0502, "rewards/accuracies": 0.78125, "rewards/chosen": -4.37764310836792, "rewards/margins": 0.7536051869392395, "rewards/rejected": -5.1312479972839355, "sft_loss": 4.143968105316162, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 0.8018527086095957, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.27660831809043884, "logits/rejected": -0.07096768170595169, "logps/chosen": -4.368846416473389, "logps/rejected": -5.057332515716553, "loss": 0.0518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.368846416473389, "rewards/margins": 0.6884865164756775, "rewards/rejected": -5.057332515716553, "sft_loss": 4.087088584899902, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 0.39634613130322927, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.27290281653404236, "logits/rejected": -0.27338218688964844, "logps/chosen": -4.375092506408691, "logps/rejected": -4.8426008224487305, "loss": 0.0532, "rewards/accuracies": 0.625, "rewards/chosen": -4.375092506408691, "rewards/margins": 0.4675084948539734, "rewards/rejected": -4.8426008224487305, "sft_loss": 4.101852893829346, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 0.41087822652882344, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.29975003004074097, "logits/rejected": -0.10043933242559433, "logps/chosen": -4.682087421417236, "logps/rejected": -5.2238993644714355, "loss": 0.0527, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.682087421417236, "rewards/margins": 0.5418123006820679, "rewards/rejected": -5.2238993644714355, "sft_loss": 4.406389236450195, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 0.5169857197006602, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.2759687006473541, "logits/rejected": -0.25068631768226624, "logps/chosen": -4.628575325012207, "logps/rejected": -5.116905689239502, "loss": 0.0514, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.628575325012207, "rewards/margins": 0.48833027482032776, "rewards/rejected": -5.116905689239502, "sft_loss": 4.2828264236450195, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 1.3389099369822377, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.1845565289258957, "logits/rejected": -0.12967592477798462, "logps/chosen": -4.451420307159424, "logps/rejected": -4.993684768676758, "loss": 0.0535, "rewards/accuracies": 0.625, "rewards/chosen": -4.451420307159424, "rewards/margins": 0.5422651171684265, "rewards/rejected": -4.993684768676758, "sft_loss": 4.071597099304199, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 0.4949625331470205, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.3145469129085541, "logits/rejected": -0.1309657245874405, "logps/chosen": -4.390885829925537, "logps/rejected": -4.963167667388916, "loss": 0.0533, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.390885829925537, "rewards/margins": 0.5722818374633789, "rewards/rejected": -4.963167667388916, "sft_loss": 4.19434118270874, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 0.6295468994335813, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.32317155599594116, "logits/rejected": -0.15155306458473206, "logps/chosen": -4.628039360046387, "logps/rejected": -5.162529945373535, "loss": 0.0528, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.628039360046387, "rewards/margins": 0.5344905257225037, "rewards/rejected": -5.162529945373535, "sft_loss": 4.412549018859863, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 0.4033918872706689, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.21534164249897003, "logits/rejected": -0.13639280200004578, "logps/chosen": -4.623472690582275, "logps/rejected": -5.190655708312988, "loss": 0.0516, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.623472690582275, "rewards/margins": 0.5671836733818054, "rewards/rejected": -5.190655708312988, "sft_loss": 4.301222324371338, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 0.5476916192913638, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.27661851048469543, "logits/rejected": -0.10544946044683456, "logps/chosen": -4.245184898376465, "logps/rejected": -4.931614875793457, "loss": 0.0515, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.245184898376465, "rewards/margins": 0.6864299774169922, "rewards/rejected": -4.931614875793457, "sft_loss": 3.9942116737365723, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 0.4772096862444193, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.22163578867912292, "logits/rejected": -0.03878837823867798, "logps/chosen": -4.369515419006348, "logps/rejected": -5.048583030700684, "loss": 0.0507, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.369515419006348, "rewards/margins": 0.6790679693222046, "rewards/rejected": -5.048583030700684, "sft_loss": 4.050380706787109, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 0.5373686869569195, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.3304591178894043, "logits/rejected": -0.2032184600830078, "logps/chosen": -4.617485046386719, "logps/rejected": -5.059828758239746, "loss": 0.053, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.617485046386719, "rewards/margins": 0.44234347343444824, "rewards/rejected": -5.059828758239746, "sft_loss": 4.327376842498779, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 0.4664562059007877, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.3769041895866394, "logits/rejected": -0.15755943953990936, "logps/chosen": -4.339583396911621, "logps/rejected": -4.930947780609131, "loss": 0.0524, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.339583396911621, "rewards/margins": 0.591364860534668, "rewards/rejected": -4.930947780609131, "sft_loss": 4.0584540367126465, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 0.6250482398641047, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.2822895050048828, "logits/rejected": -0.19432392716407776, "logps/chosen": -4.452942371368408, "logps/rejected": -5.425656318664551, "loss": 0.0509, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.452942371368408, "rewards/margins": 0.9727136492729187, "rewards/rejected": -5.425656318664551, "sft_loss": 4.201419353485107, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 0.8350884908004433, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.2402046173810959, "logits/rejected": -0.26276570558547974, "logps/chosen": -4.695631980895996, "logps/rejected": -5.093986511230469, "loss": 0.0525, "rewards/accuracies": 0.625, "rewards/chosen": -4.695631980895996, "rewards/margins": 0.39835453033447266, "rewards/rejected": -5.093986511230469, "sft_loss": 4.272342681884766, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 0.49061162342011755, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.4282917380332947, "logits/rejected": -0.2478407621383667, "logps/chosen": -4.363981246948242, "logps/rejected": -5.022068977355957, "loss": 0.052, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.363981246948242, "rewards/margins": 0.6580876708030701, "rewards/rejected": -5.022068977355957, "sft_loss": 4.1623854637146, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 0.4053383565155827, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.20890608429908752, "logits/rejected": -0.1709698736667633, "logps/chosen": -4.5597615242004395, "logps/rejected": -4.990670204162598, "loss": 0.0521, "rewards/accuracies": 0.65625, "rewards/chosen": -4.5597615242004395, "rewards/margins": 0.43090900778770447, "rewards/rejected": -4.990670204162598, "sft_loss": 4.204160213470459, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 0.5077357597268557, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.17620953917503357, "logits/rejected": 0.038376279175281525, "logps/chosen": -4.530808925628662, "logps/rejected": -5.178600788116455, "loss": 0.0521, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.530808925628662, "rewards/margins": 0.6477917432785034, "rewards/rejected": -5.178600788116455, "sft_loss": 4.258144378662109, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 0.535113113045043, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.18129639327526093, "logits/rejected": -0.02067035436630249, "logps/chosen": -4.638469219207764, "logps/rejected": -5.311728000640869, "loss": 0.053, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.638469219207764, "rewards/margins": 0.6732583045959473, "rewards/rejected": -5.311728000640869, "sft_loss": 4.301131248474121, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 0.6531795517975728, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.1367073506116867, "logits/rejected": -0.10230563580989838, "logps/chosen": -4.544414043426514, "logps/rejected": -5.14690637588501, "loss": 0.0526, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.544414043426514, "rewards/margins": 0.6024927496910095, "rewards/rejected": -5.14690637588501, "sft_loss": 4.306552886962891, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 0.4472285225867335, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.28611913323402405, "logits/rejected": -0.26917344331741333, "logps/chosen": -4.406510353088379, "logps/rejected": -4.875698566436768, "loss": 0.0532, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.406510353088379, "rewards/margins": 0.4691886305809021, "rewards/rejected": -4.875698566436768, "sft_loss": 4.193192481994629, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 0.44877368661612094, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.3733294904232025, "logits/rejected": -0.2503949999809265, "logps/chosen": -4.5276312828063965, "logps/rejected": -5.178982734680176, "loss": 0.0532, "rewards/accuracies": 0.71875, "rewards/chosen": -4.5276312828063965, "rewards/margins": 0.6513513326644897, "rewards/rejected": -5.178982734680176, "sft_loss": 4.268059730529785, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 0.5827149114778036, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.21721968054771423, "logits/rejected": 0.10426320880651474, "logps/chosen": -4.440943717956543, "logps/rejected": -5.223211288452148, "loss": 0.0497, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.440943717956543, "rewards/margins": 0.782267689704895, "rewards/rejected": -5.223211288452148, "sft_loss": 4.089264869689941, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 0.5107309445628425, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.24261124432086945, "logits/rejected": 0.019307659938931465, "logps/chosen": -4.272761821746826, "logps/rejected": -5.059557914733887, "loss": 0.052, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.272761821746826, "rewards/margins": 0.7867968082427979, "rewards/rejected": -5.059557914733887, "sft_loss": 4.039781093597412, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 0.6359037944598581, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.18060888350009918, "logits/rejected": -0.014508080668747425, "logps/chosen": -4.498339653015137, "logps/rejected": -5.16357421875, "loss": 0.0515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.498339653015137, "rewards/margins": 0.6652345657348633, "rewards/rejected": -5.16357421875, "sft_loss": 4.235073566436768, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 0.6393126236107872, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.15019895136356354, "logits/rejected": -0.05985778570175171, "logps/chosen": -4.496522426605225, "logps/rejected": -5.198064804077148, "loss": 0.0504, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.496522426605225, "rewards/margins": 0.7015424966812134, "rewards/rejected": -5.198064804077148, "sft_loss": 4.151566505432129, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 0.5093607606357484, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.20116212964057922, "logits/rejected": -0.1553642749786377, "logps/chosen": -4.311014652252197, "logps/rejected": -5.0506696701049805, "loss": 0.0521, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.311014652252197, "rewards/margins": 0.7396548986434937, "rewards/rejected": -5.0506696701049805, "sft_loss": 4.00994348526001, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 0.5657638745166378, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.24293391406536102, "logits/rejected": -0.13297154009342194, "logps/chosen": -4.716964244842529, "logps/rejected": -5.143126487731934, "loss": 0.0532, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.716964244842529, "rewards/margins": 0.42616158723831177, "rewards/rejected": -5.143126487731934, "sft_loss": 4.424800872802734, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 0.4765104519295766, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.3335570693016052, "logits/rejected": -0.25355204939842224, "logps/chosen": -4.636630058288574, "logps/rejected": -5.252082347869873, "loss": 0.0519, "rewards/accuracies": 0.6875, "rewards/chosen": -4.636630058288574, "rewards/margins": 0.6154532432556152, "rewards/rejected": -5.252082347869873, "sft_loss": 4.426233768463135, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 0.44084472243021816, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.3639964461326599, "logits/rejected": -0.12874539196491241, "logps/chosen": -4.40498685836792, "logps/rejected": -4.935255527496338, "loss": 0.0522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.40498685836792, "rewards/margins": 0.5302689671516418, "rewards/rejected": -4.935255527496338, "sft_loss": 4.194338798522949, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 0.4315771678550106, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.2753906846046448, "logits/rejected": -0.08385102450847626, "logps/chosen": -4.243175506591797, "logps/rejected": -4.9434309005737305, "loss": 0.051, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.243175506591797, "rewards/margins": 0.7002550363540649, "rewards/rejected": -4.9434309005737305, "sft_loss": 4.0009446144104, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 0.5705012184083136, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.3476327359676361, "logits/rejected": -0.07195943593978882, "logps/chosen": -4.406363487243652, "logps/rejected": -4.905777454376221, "loss": 0.0521, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.406363487243652, "rewards/margins": 0.4994131922721863, "rewards/rejected": -4.905777454376221, "sft_loss": 4.096680641174316, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 0.4734632097680218, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.22654542326927185, "logits/rejected": -0.13219550251960754, "logps/chosen": -4.476452827453613, "logps/rejected": -5.189483642578125, "loss": 0.0505, "rewards/accuracies": 0.75, "rewards/chosen": -4.476452827453613, "rewards/margins": 0.7130311727523804, "rewards/rejected": -5.189483642578125, "sft_loss": 4.207291603088379, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 0.6920160565963868, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.16666167974472046, "logits/rejected": 0.041519664227962494, "logps/chosen": -4.522221565246582, "logps/rejected": -5.004232883453369, "loss": 0.0522, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.522221565246582, "rewards/margins": 0.4820104241371155, "rewards/rejected": -5.004232883453369, "sft_loss": 4.196804523468018, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 0.5798926531076499, "learning_rate": 7.638933899585354e-07, "logits/chosen": -0.017039481550455093, "logits/rejected": -0.031264323741197586, "logps/chosen": -4.368385314941406, "logps/rejected": -5.084227085113525, "loss": 0.0512, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.368385314941406, "rewards/margins": 0.7158415913581848, "rewards/rejected": -5.084227085113525, "sft_loss": 4.072343826293945, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 0.8352305328176775, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.22756394743919373, "logits/rejected": -0.10421310365200043, "logps/chosen": -4.519444465637207, "logps/rejected": -5.141909599304199, "loss": 0.0548, "rewards/accuracies": 0.5625, "rewards/chosen": -4.519444465637207, "rewards/margins": 0.6224651336669922, "rewards/rejected": -5.141909599304199, "sft_loss": 4.257163047790527, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 0.6049852133055805, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.2685548663139343, "logits/rejected": -0.305839478969574, "logps/chosen": -4.889684677124023, "logps/rejected": -5.2240166664123535, "loss": 0.0537, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.889684677124023, "rewards/margins": 0.334332138299942, "rewards/rejected": -5.2240166664123535, "sft_loss": 4.526303768157959, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 0.3697278141071938, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.4287821650505066, "logits/rejected": -0.1586742401123047, "logps/chosen": -4.467938423156738, "logps/rejected": -5.110435485839844, "loss": 0.0519, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.467938423156738, "rewards/margins": 0.6424973607063293, "rewards/rejected": -5.110435485839844, "sft_loss": 4.301453590393066, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 0.4839978796312147, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.411714643239975, "logits/rejected": -0.2825588881969452, "logps/chosen": -4.301059722900391, "logps/rejected": -4.954368591308594, "loss": 0.0513, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.301059722900391, "rewards/margins": 0.6533088684082031, "rewards/rejected": -4.954368591308594, "sft_loss": 4.1210832595825195, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 0.4909297019707165, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.3669358491897583, "logits/rejected": -0.24129393696784973, "logps/chosen": -4.29843807220459, "logps/rejected": -4.948997974395752, "loss": 0.0512, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.29843807220459, "rewards/margins": 0.6505595445632935, "rewards/rejected": -4.948997974395752, "sft_loss": 4.052264213562012, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 0.43671107394456493, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.4114173352718353, "logits/rejected": -0.2702207565307617, "logps/chosen": -4.2567853927612305, "logps/rejected": -4.826117515563965, "loss": 0.0531, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.2567853927612305, "rewards/margins": 0.5693323016166687, "rewards/rejected": -4.826117515563965, "sft_loss": 4.011662006378174, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 0.5556279783763788, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.4900270998477936, "logits/rejected": -0.13813424110412598, "logps/chosen": -4.605090141296387, "logps/rejected": -5.394580841064453, "loss": 0.052, "rewards/accuracies": 0.71875, "rewards/chosen": -4.605090141296387, "rewards/margins": 0.7894911766052246, "rewards/rejected": -5.394580841064453, "sft_loss": 4.288597106933594, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 0.5509398392055068, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.40336865186691284, "logits/rejected": -0.1709717959165573, "logps/chosen": -4.6631364822387695, "logps/rejected": -5.51657772064209, "loss": 0.0516, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.6631364822387695, "rewards/margins": 0.8534411191940308, "rewards/rejected": -5.51657772064209, "sft_loss": 4.334843158721924, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 0.5955483599486879, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.3279435634613037, "logits/rejected": -0.07166824489831924, "logps/chosen": -4.5641679763793945, "logps/rejected": -5.14822244644165, "loss": 0.0519, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.5641679763793945, "rewards/margins": 0.5840541124343872, "rewards/rejected": -5.14822244644165, "sft_loss": 4.200583457946777, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 0.43430584417433116, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.3868328928947449, "logits/rejected": -0.16218364238739014, "logps/chosen": -4.473959922790527, "logps/rejected": -4.889747619628906, "loss": 0.0529, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.473959922790527, "rewards/margins": 0.4157875180244446, "rewards/rejected": -4.889747619628906, "sft_loss": 4.134532451629639, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 0.5300640696357395, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.2527707815170288, "logits/rejected": -0.23362918198108673, "logps/chosen": -4.4117431640625, "logps/rejected": -4.904858589172363, "loss": 0.0526, "rewards/accuracies": 0.65625, "rewards/chosen": -4.4117431640625, "rewards/margins": 0.4931156635284424, "rewards/rejected": -4.904858589172363, "sft_loss": 4.160271644592285, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 0.4918512380012406, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.19767886400222778, "logits/rejected": -0.16038745641708374, "logps/chosen": -4.483613967895508, "logps/rejected": -5.078944206237793, "loss": 0.0518, "rewards/accuracies": 0.65625, "rewards/chosen": -4.483613967895508, "rewards/margins": 0.5953308343887329, "rewards/rejected": -5.078944206237793, "sft_loss": 4.205395698547363, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 0.6306090411080313, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.279359370470047, "logits/rejected": -0.24908506870269775, "logps/chosen": -4.543668270111084, "logps/rejected": -5.088433742523193, "loss": 0.0528, "rewards/accuracies": 0.65625, "rewards/chosen": -4.543668270111084, "rewards/margins": 0.5447657108306885, "rewards/rejected": -5.088433742523193, "sft_loss": 4.243107795715332, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 0.47948094032983946, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.2636973261833191, "logits/rejected": -0.2998460531234741, "logps/chosen": -4.743630409240723, "logps/rejected": -5.1253662109375, "loss": 0.0531, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.743630409240723, "rewards/margins": 0.3817363381385803, "rewards/rejected": -5.1253662109375, "sft_loss": 4.456059455871582, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 0.5996058550320081, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.3166981637477875, "logits/rejected": -0.20749597251415253, "logps/chosen": -4.545644283294678, "logps/rejected": -5.04119348526001, "loss": 0.0522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.545644283294678, "rewards/margins": 0.4955490529537201, "rewards/rejected": -5.04119348526001, "sft_loss": 4.291190147399902, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 0.5220798479877709, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.4534633159637451, "logits/rejected": -0.2128814458847046, "logps/chosen": -4.273514747619629, "logps/rejected": -4.954968452453613, "loss": 0.0505, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.273514747619629, "rewards/margins": 0.6814538240432739, "rewards/rejected": -4.954968452453613, "sft_loss": 4.040926933288574, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 0.7702132976372609, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.35295426845550537, "logits/rejected": -0.16922760009765625, "logps/chosen": -4.291215419769287, "logps/rejected": -5.033682346343994, "loss": 0.0505, "rewards/accuracies": 0.6875, "rewards/chosen": -4.291215419769287, "rewards/margins": 0.7424668073654175, "rewards/rejected": -5.033682346343994, "sft_loss": 3.9390976428985596, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 0.721486911740481, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.4789690375328064, "logits/rejected": -0.23137912154197693, "logps/chosen": -4.274365425109863, "logps/rejected": -4.961302757263184, "loss": 0.0501, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.274365425109863, "rewards/margins": 0.6869370341300964, "rewards/rejected": -4.961302757263184, "sft_loss": 3.922863483428955, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 0.4295817185964868, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.4171268045902252, "logits/rejected": -0.1910628080368042, "logps/chosen": -4.448271751403809, "logps/rejected": -5.207947731018066, "loss": 0.0508, "rewards/accuracies": 0.6875, "rewards/chosen": -4.448271751403809, "rewards/margins": 0.7596766352653503, "rewards/rejected": -5.207947731018066, "sft_loss": 4.162169456481934, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 0.5017782636688317, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.5226677060127258, "logits/rejected": -0.3166903853416443, "logps/chosen": -4.627984046936035, "logps/rejected": -5.201998710632324, "loss": 0.0518, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.627984046936035, "rewards/margins": 0.5740151405334473, "rewards/rejected": -5.201998710632324, "sft_loss": 4.343914985656738, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 0.622045784701378, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.43745899200439453, "logits/rejected": -0.3405126631259918, "logps/chosen": -4.604950904846191, "logps/rejected": -5.118515968322754, "loss": 0.0527, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.604950904846191, "rewards/margins": 0.51356440782547, "rewards/rejected": -5.118515968322754, "sft_loss": 4.261171817779541, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 0.46280700860813107, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.3574795722961426, "logits/rejected": -0.2373519241809845, "logps/chosen": -4.473115921020508, "logps/rejected": -5.071488857269287, "loss": 0.0511, "rewards/accuracies": 0.71875, "rewards/chosen": -4.473115921020508, "rewards/margins": 0.5983726978302002, "rewards/rejected": -5.071488857269287, "sft_loss": 4.186740398406982, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 0.5617703312518776, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.3852320611476898, "logits/rejected": -0.23650208115577698, "logps/chosen": -4.551094055175781, "logps/rejected": -5.2193145751953125, "loss": 0.0526, "rewards/accuracies": 0.65625, "rewards/chosen": -4.551094055175781, "rewards/margins": 0.6682202816009521, "rewards/rejected": -5.2193145751953125, "sft_loss": 4.264029026031494, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 0.6630442372036683, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.3888396918773651, "logits/rejected": -0.055044613778591156, "logps/chosen": -4.306995391845703, "logps/rejected": -5.142845153808594, "loss": 0.0496, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.306995391845703, "rewards/margins": 0.8358501195907593, "rewards/rejected": -5.142845153808594, "sft_loss": 4.013720512390137, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 0.7976635274894721, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.48275822401046753, "logits/rejected": -0.24777165055274963, "logps/chosen": -4.506577491760254, "logps/rejected": -5.197785377502441, "loss": 0.0509, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.506577491760254, "rewards/margins": 0.6912076473236084, "rewards/rejected": -5.197785377502441, "sft_loss": 4.213435173034668, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 0.38382243279532763, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.2914837896823883, "logits/rejected": -0.262010395526886, "logps/chosen": -4.344498157501221, "logps/rejected": -5.013187408447266, "loss": 0.0513, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.344498157501221, "rewards/margins": 0.6686891317367554, "rewards/rejected": -5.013187408447266, "sft_loss": 4.004141807556152, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 0.7023075707396644, "learning_rate": 7.272866090342493e-07, "logits/chosen": -0.16346505284309387, "logits/rejected": -0.10149259865283966, "logps/chosen": -4.34529972076416, "logps/rejected": -5.073423862457275, "loss": 0.0507, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.34529972076416, "rewards/margins": 0.7281247973442078, "rewards/rejected": -5.073423862457275, "sft_loss": 3.941153049468994, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 0.49306275936732125, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.3227735161781311, "logits/rejected": -0.22728899121284485, "logps/chosen": -4.6322503089904785, "logps/rejected": -5.29410457611084, "loss": 0.0517, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.6322503089904785, "rewards/margins": 0.6618545055389404, "rewards/rejected": -5.29410457611084, "sft_loss": 4.227608680725098, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 1.0035943476967324, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.26348575949668884, "logits/rejected": -0.17716734111309052, "logps/chosen": -4.563479423522949, "logps/rejected": -5.222577095031738, "loss": 0.0524, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.563479423522949, "rewards/margins": 0.6590980291366577, "rewards/rejected": -5.222577095031738, "sft_loss": 4.292522430419922, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 0.45450786301419244, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.5200982093811035, "logits/rejected": -0.21942846477031708, "logps/chosen": -4.524590492248535, "logps/rejected": -5.191534996032715, "loss": 0.052, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.524590492248535, "rewards/margins": 0.6669445633888245, "rewards/rejected": -5.191534996032715, "sft_loss": 4.256932258605957, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 0.45066743934900094, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.4577765464782715, "logits/rejected": -0.25274404883384705, "logps/chosen": -4.507070064544678, "logps/rejected": -5.168764591217041, "loss": 0.0516, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.507070064544678, "rewards/margins": 0.6616944670677185, "rewards/rejected": -5.168764591217041, "sft_loss": 4.1500983238220215, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 0.7021636941447833, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.4425809979438782, "logits/rejected": -0.24780945479869843, "logps/chosen": -4.2700958251953125, "logps/rejected": -4.92608642578125, "loss": 0.0517, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.2700958251953125, "rewards/margins": 0.655990719795227, "rewards/rejected": -4.92608642578125, "sft_loss": 4.022648811340332, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 0.4364800359106503, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.3534063398838043, "logits/rejected": -0.2112211287021637, "logps/chosen": -4.486388206481934, "logps/rejected": -5.181911468505859, "loss": 0.052, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.486388206481934, "rewards/margins": 0.6955228447914124, "rewards/rejected": -5.181911468505859, "sft_loss": 4.218506813049316, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 0.5266734489894225, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.4793574810028076, "logits/rejected": -0.2572689950466156, "logps/chosen": -4.473581314086914, "logps/rejected": -5.181696891784668, "loss": 0.0511, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.473581314086914, "rewards/margins": 0.7081155180931091, "rewards/rejected": -5.181696891784668, "sft_loss": 4.205438613891602, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 0.6791010230273304, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.3394157290458679, "logits/rejected": -0.1353457123041153, "logps/chosen": -4.384115219116211, "logps/rejected": -5.233609676361084, "loss": 0.0523, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.384115219116211, "rewards/margins": 0.8494939804077148, "rewards/rejected": -5.233609676361084, "sft_loss": 4.137483596801758, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 0.4237329746059985, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.3028372526168823, "logits/rejected": -0.08005464822053909, "logps/chosen": -4.468472003936768, "logps/rejected": -5.0935516357421875, "loss": 0.0521, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.468472003936768, "rewards/margins": 0.6250793933868408, "rewards/rejected": -5.0935516357421875, "sft_loss": 4.173741340637207, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 0.7904664403236191, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.4600791037082672, "logits/rejected": -0.21939058601856232, "logps/chosen": -4.389094352722168, "logps/rejected": -5.0552263259887695, "loss": 0.0524, "rewards/accuracies": 0.71875, "rewards/chosen": -4.389094352722168, "rewards/margins": 0.6661325097084045, "rewards/rejected": -5.0552263259887695, "sft_loss": 4.177814960479736, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 0.5259227542270914, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.365754097700119, "logits/rejected": -0.15738160908222198, "logps/chosen": -4.536040306091309, "logps/rejected": -5.309323310852051, "loss": 0.0513, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.536040306091309, "rewards/margins": 0.773283064365387, "rewards/rejected": -5.309323310852051, "sft_loss": 4.273083686828613, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 0.44323002580409565, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.3308809697628021, "logits/rejected": -0.21239659190177917, "logps/chosen": -4.450998783111572, "logps/rejected": -5.074978351593018, "loss": 0.0515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.450998783111572, "rewards/margins": 0.6239796280860901, "rewards/rejected": -5.074978351593018, "sft_loss": 4.151812553405762, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 0.47793631642078366, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.34116891026496887, "logits/rejected": -0.3185744881629944, "logps/chosen": -4.456019401550293, "logps/rejected": -5.188083648681641, "loss": 0.0504, "rewards/accuracies": 0.71875, "rewards/chosen": -4.456019401550293, "rewards/margins": 0.732064425945282, "rewards/rejected": -5.188083648681641, "sft_loss": 4.120429039001465, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 0.579752216710794, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.5160247087478638, "logits/rejected": -0.3117043375968933, "logps/chosen": -4.367626190185547, "logps/rejected": -4.930412292480469, "loss": 0.0521, "rewards/accuracies": 0.6875, "rewards/chosen": -4.367626190185547, "rewards/margins": 0.5627862215042114, "rewards/rejected": -4.930412292480469, "sft_loss": 4.12152624130249, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 0.6163130040907793, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.38553833961486816, "logits/rejected": -0.23147761821746826, "logps/chosen": -4.212340354919434, "logps/rejected": -4.871836185455322, "loss": 0.0522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.212340354919434, "rewards/margins": 0.6594957709312439, "rewards/rejected": -4.871836185455322, "sft_loss": 3.979743480682373, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.16218583285808563, "eval_logits/rejected": 0.2726520597934723, "eval_logps/chosen": -4.613275527954102, "eval_logps/rejected": -5.277085304260254, "eval_loss": 0.050447020679712296, "eval_rewards/accuracies": 0.6646884083747864, "eval_rewards/chosen": -4.613275527954102, "eval_rewards/margins": 0.6638097763061523, "eval_rewards/rejected": -5.277085304260254, "eval_runtime": 46.8637, "eval_samples_per_second": 28.7, "eval_sft_loss": 4.179710388183594, "eval_steps_per_second": 7.191, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 0.8125972917328307, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.30984312295913696, "logits/rejected": -0.16324174404144287, "logps/chosen": -4.691993713378906, "logps/rejected": -5.421947479248047, "loss": 0.0519, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.691993713378906, "rewards/margins": 0.7299537062644958, "rewards/rejected": -5.421947479248047, "sft_loss": 4.399287223815918, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 0.6208444104515567, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.3938482403755188, "logits/rejected": -0.18808431923389435, "logps/chosen": -4.691821098327637, "logps/rejected": -5.170048236846924, "loss": 0.0509, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.691821098327637, "rewards/margins": 0.47822675108909607, "rewards/rejected": -5.170048236846924, "sft_loss": 4.298820972442627, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 0.44700685429264475, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.23651990294456482, "logits/rejected": -0.11198411136865616, "logps/chosen": -4.394803524017334, "logps/rejected": -5.207266330718994, "loss": 0.0508, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.394803524017334, "rewards/margins": 0.8124624490737915, "rewards/rejected": -5.207266330718994, "sft_loss": 4.091854095458984, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 0.6111476651853875, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.31173262000083923, "logits/rejected": -0.18862289190292358, "logps/chosen": -4.262498378753662, "logps/rejected": -4.927305698394775, "loss": 0.0511, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.262498378753662, "rewards/margins": 0.6648072004318237, "rewards/rejected": -4.927305698394775, "sft_loss": 3.939523220062256, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 1.236864464715216, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.2820888161659241, "logits/rejected": -0.20513828098773956, "logps/chosen": -4.186476707458496, "logps/rejected": -4.759819984436035, "loss": 0.0534, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.186476707458496, "rewards/margins": 0.5733426809310913, "rewards/rejected": -4.759819984436035, "sft_loss": 3.976614475250244, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 0.43286785735508476, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.32675403356552124, "logits/rejected": -0.23593978583812714, "logps/chosen": -4.594229698181152, "logps/rejected": -5.107740879058838, "loss": 0.0532, "rewards/accuracies": 0.65625, "rewards/chosen": -4.594229698181152, "rewards/margins": 0.5135103464126587, "rewards/rejected": -5.107740879058838, "sft_loss": 4.226545810699463, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 0.3582759153640644, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.4037497043609619, "logits/rejected": -0.2016705572605133, "logps/chosen": -4.662694454193115, "logps/rejected": -5.2524003982543945, "loss": 0.0525, "rewards/accuracies": 0.6875, "rewards/chosen": -4.662694454193115, "rewards/margins": 0.5897052884101868, "rewards/rejected": -5.2524003982543945, "sft_loss": 4.4913787841796875, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 0.4573925711303691, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.3449918329715729, "logits/rejected": -0.1870274394750595, "logps/chosen": -4.6676716804504395, "logps/rejected": -5.286660194396973, "loss": 0.0525, "rewards/accuracies": 0.65625, "rewards/chosen": -4.6676716804504395, "rewards/margins": 0.6189885139465332, "rewards/rejected": -5.286660194396973, "sft_loss": 4.378539085388184, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 0.48802271760572685, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.39901071786880493, "logits/rejected": -0.22823762893676758, "logps/chosen": -4.416820526123047, "logps/rejected": -5.092315673828125, "loss": 0.0515, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.416820526123047, "rewards/margins": 0.6754951477050781, "rewards/rejected": -5.092315673828125, "sft_loss": 4.0865936279296875, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 0.508227844853999, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.3008289933204651, "logits/rejected": -0.10734491050243378, "logps/chosen": -4.298119068145752, "logps/rejected": -4.975314617156982, "loss": 0.051, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.298119068145752, "rewards/margins": 0.6771960854530334, "rewards/rejected": -4.975314617156982, "sft_loss": 4.031597137451172, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 0.44280748250592333, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.3537518382072449, "logits/rejected": 0.0016300469869747758, "logps/chosen": -4.296114444732666, "logps/rejected": -5.029053211212158, "loss": 0.0505, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.296114444732666, "rewards/margins": 0.7329393029212952, "rewards/rejected": -5.029053211212158, "sft_loss": 4.010401725769043, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 0.5994814714419845, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.14458681643009186, "logits/rejected": -0.024362895637750626, "logps/chosen": -4.518597602844238, "logps/rejected": -5.020750999450684, "loss": 0.0524, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.518597602844238, "rewards/margins": 0.502153217792511, "rewards/rejected": -5.020750999450684, "sft_loss": 4.172475814819336, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 0.4992944547596079, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.1841050386428833, "logits/rejected": -0.08634983003139496, "logps/chosen": -4.33657169342041, "logps/rejected": -5.232463836669922, "loss": 0.0494, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.33657169342041, "rewards/margins": 0.8958921432495117, "rewards/rejected": -5.232463836669922, "sft_loss": 3.983189344406128, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 0.5969208799043295, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.3868961036205292, "logits/rejected": -0.24560889601707458, "logps/chosen": -4.6813249588012695, "logps/rejected": -5.233216285705566, "loss": 0.053, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.6813249588012695, "rewards/margins": 0.5518918037414551, "rewards/rejected": -5.233216285705566, "sft_loss": 4.4259934425354, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 1.0979867390491649, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.16251157224178314, "logits/rejected": -0.1308099925518036, "logps/chosen": -4.580937385559082, "logps/rejected": -5.119907855987549, "loss": 0.0518, "rewards/accuracies": 0.6875, "rewards/chosen": -4.580937385559082, "rewards/margins": 0.5389704704284668, "rewards/rejected": -5.119907855987549, "sft_loss": 4.249577045440674, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 0.7299460616177387, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.4401113986968994, "logits/rejected": -0.20187684893608093, "logps/chosen": -4.588586330413818, "logps/rejected": -5.211333751678467, "loss": 0.052, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.588586330413818, "rewards/margins": 0.6227480173110962, "rewards/rejected": -5.211333751678467, "sft_loss": 4.332615852355957, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 0.6580906178724784, "learning_rate": 6.818417974097246e-07, "logits/chosen": -0.22082960605621338, "logits/rejected": -0.05857197567820549, "logps/chosen": -4.458443641662598, "logps/rejected": -5.258761882781982, "loss": 0.0508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.458443641662598, "rewards/margins": 0.8003188967704773, "rewards/rejected": -5.258761882781982, "sft_loss": 4.194399833679199, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 0.44049659030132926, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.3119185268878937, "logits/rejected": -0.24192802608013153, "logps/chosen": -4.311916828155518, "logps/rejected": -4.910783290863037, "loss": 0.0523, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.311916828155518, "rewards/margins": 0.5988671183586121, "rewards/rejected": -4.910783290863037, "sft_loss": 4.0275068283081055, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 0.7358415647474225, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.2163584679365158, "logits/rejected": -0.20647704601287842, "logps/chosen": -4.386069297790527, "logps/rejected": -4.962353706359863, "loss": 0.0538, "rewards/accuracies": 0.6875, "rewards/chosen": -4.386069297790527, "rewards/margins": 0.5762845277786255, "rewards/rejected": -4.962353706359863, "sft_loss": 4.141005039215088, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 0.7678855187302202, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.28678449988365173, "logits/rejected": -0.059916090220212936, "logps/chosen": -4.479846477508545, "logps/rejected": -5.117179870605469, "loss": 0.0526, "rewards/accuracies": 0.625, "rewards/chosen": -4.479846477508545, "rewards/margins": 0.6373331546783447, "rewards/rejected": -5.117179870605469, "sft_loss": 4.278372287750244, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 0.38028014441452435, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.35091131925582886, "logits/rejected": -0.14311781525611877, "logps/chosen": -4.657089710235596, "logps/rejected": -5.283184051513672, "loss": 0.0513, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.657089710235596, "rewards/margins": 0.6260942220687866, "rewards/rejected": -5.283184051513672, "sft_loss": 4.375996112823486, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 0.4178525242758897, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.3786674439907074, "logits/rejected": -0.20341435074806213, "logps/chosen": -4.496813774108887, "logps/rejected": -5.082845687866211, "loss": 0.0512, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.496813774108887, "rewards/margins": 0.5860317945480347, "rewards/rejected": -5.082845687866211, "sft_loss": 4.184346675872803, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 0.5131854014657652, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.2360183298587799, "logits/rejected": -0.15669827163219452, "logps/chosen": -4.303060531616211, "logps/rejected": -4.955399990081787, "loss": 0.0515, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.303060531616211, "rewards/margins": 0.6523396968841553, "rewards/rejected": -4.955399990081787, "sft_loss": 4.033090114593506, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 1.1599307981261955, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.2967987358570099, "logits/rejected": -0.22975853085517883, "logps/chosen": -4.2588067054748535, "logps/rejected": -4.761368274688721, "loss": 0.0527, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.2588067054748535, "rewards/margins": 0.5025621652603149, "rewards/rejected": -4.761368274688721, "sft_loss": 3.913691997528076, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 0.41847911237686436, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.2883647382259369, "logits/rejected": -0.22551052272319794, "logps/chosen": -4.588094234466553, "logps/rejected": -5.287965774536133, "loss": 0.0523, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.588094234466553, "rewards/margins": 0.6998715400695801, "rewards/rejected": -5.287965774536133, "sft_loss": 4.375986099243164, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 0.7019822923715036, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.34520524740219116, "logits/rejected": -0.28489288687705994, "logps/chosen": -4.859744071960449, "logps/rejected": -5.2669358253479, "loss": 0.0535, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.859744071960449, "rewards/margins": 0.40719157457351685, "rewards/rejected": -5.2669358253479, "sft_loss": 4.574738025665283, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 0.4557018145797472, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.4234161376953125, "logits/rejected": -0.2507956922054291, "logps/chosen": -4.543206214904785, "logps/rejected": -5.178067207336426, "loss": 0.0522, "rewards/accuracies": 0.6875, "rewards/chosen": -4.543206214904785, "rewards/margins": 0.6348603367805481, "rewards/rejected": -5.178067207336426, "sft_loss": 4.318524360656738, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 0.4933166093135709, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.3509432375431061, "logits/rejected": -0.1096065416932106, "logps/chosen": -4.508671760559082, "logps/rejected": -4.966963768005371, "loss": 0.0515, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.508671760559082, "rewards/margins": 0.4582923352718353, "rewards/rejected": -4.966963768005371, "sft_loss": 4.152871608734131, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 0.4803004498103421, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.36170434951782227, "logits/rejected": -0.10494896024465561, "logps/chosen": -4.229575157165527, "logps/rejected": -5.020062446594238, "loss": 0.0509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.229575157165527, "rewards/margins": 0.7904866933822632, "rewards/rejected": -5.020062446594238, "sft_loss": 4.025331497192383, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 0.5985182635532804, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.405205100774765, "logits/rejected": -0.22326946258544922, "logps/chosen": -4.403514862060547, "logps/rejected": -4.895939826965332, "loss": 0.0526, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.403514862060547, "rewards/margins": 0.4924253523349762, "rewards/rejected": -4.895939826965332, "sft_loss": 4.0891618728637695, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 0.6600807629722721, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.3793318569660187, "logits/rejected": -0.21826522052288055, "logps/chosen": -4.522066116333008, "logps/rejected": -5.230228900909424, "loss": 0.0522, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.522066116333008, "rewards/margins": 0.7081626653671265, "rewards/rejected": -5.230228900909424, "sft_loss": 4.325783729553223, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 0.5657421936672914, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.40977898240089417, "logits/rejected": -0.2763887345790863, "logps/chosen": -4.552574157714844, "logps/rejected": -5.204157829284668, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.552574157714844, "rewards/margins": 0.6515840291976929, "rewards/rejected": -5.204157829284668, "sft_loss": 4.2583231925964355, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 0.4706038058812201, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.43373212218284607, "logits/rejected": -0.2594808042049408, "logps/chosen": -4.4295654296875, "logps/rejected": -5.267152786254883, "loss": 0.0511, "rewards/accuracies": 0.6875, "rewards/chosen": -4.4295654296875, "rewards/margins": 0.8375871777534485, "rewards/rejected": -5.267152786254883, "sft_loss": 4.168534278869629, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 0.44872008369044214, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.36206430196762085, "logits/rejected": -0.28401902318000793, "logps/chosen": -4.553805828094482, "logps/rejected": -5.051944255828857, "loss": 0.0513, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.553805828094482, "rewards/margins": 0.4981384873390198, "rewards/rejected": -5.051944255828857, "sft_loss": 4.1244049072265625, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 0.4122542208426088, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.46242958307266235, "logits/rejected": -0.24346823990345, "logps/chosen": -4.514195919036865, "logps/rejected": -5.153885841369629, "loss": 0.0517, "rewards/accuracies": 0.71875, "rewards/chosen": -4.514195919036865, "rewards/margins": 0.6396892070770264, "rewards/rejected": -5.153885841369629, "sft_loss": 4.257824897766113, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 0.5988318190800053, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.387522429227829, "logits/rejected": -0.3059254288673401, "logps/chosen": -4.457314968109131, "logps/rejected": -4.824244499206543, "loss": 0.0538, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.457314968109131, "rewards/margins": 0.3669296205043793, "rewards/rejected": -4.824244499206543, "sft_loss": 4.090696811676025, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 0.4148136422508877, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.2866331934928894, "logits/rejected": -0.07610544562339783, "logps/chosen": -4.626020431518555, "logps/rejected": -5.238821506500244, "loss": 0.0509, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.626020431518555, "rewards/margins": 0.6128014922142029, "rewards/rejected": -5.238821506500244, "sft_loss": 4.2085371017456055, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 0.567187780871907, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.36029210686683655, "logits/rejected": -0.22232429683208466, "logps/chosen": -4.536396503448486, "logps/rejected": -5.154208183288574, "loss": 0.0519, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.536396503448486, "rewards/margins": 0.6178122758865356, "rewards/rejected": -5.154208183288574, "sft_loss": 4.228185653686523, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 0.4092259618625428, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.41812753677368164, "logits/rejected": -0.36150071024894714, "logps/chosen": -4.364471435546875, "logps/rejected": -4.96616792678833, "loss": 0.0522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.364471435546875, "rewards/margins": 0.6016958951950073, "rewards/rejected": -4.96616792678833, "sft_loss": 4.140119552612305, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 0.6595402733474404, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.2546065151691437, "logits/rejected": -0.09192848950624466, "logps/chosen": -4.478928565979004, "logps/rejected": -4.960371494293213, "loss": 0.0517, "rewards/accuracies": 0.65625, "rewards/chosen": -4.478928565979004, "rewards/margins": 0.48144254088401794, "rewards/rejected": -4.960371494293213, "sft_loss": 4.136466026306152, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 0.5332527476480814, "learning_rate": 6.465482192203129e-07, "logits/chosen": -0.17541834712028503, "logits/rejected": -0.09706973284482956, "logps/chosen": -4.556529998779297, "logps/rejected": -5.159417629241943, "loss": 0.0528, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.556529998779297, "rewards/margins": 0.6028882265090942, "rewards/rejected": -5.159417629241943, "sft_loss": 4.306204319000244, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 0.4498464591100737, "learning_rate": 6.45058504694559e-07, "logits/chosen": -0.16866278648376465, "logits/rejected": -0.141506165266037, "logps/chosen": -4.507957935333252, "logps/rejected": -5.1205620765686035, "loss": 0.0528, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.507957935333252, "rewards/margins": 0.6126040816307068, "rewards/rejected": -5.1205620765686035, "sft_loss": 4.23211145401001, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 0.8215085878156593, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.35125938057899475, "logits/rejected": -0.16566213965415955, "logps/chosen": -4.5039963722229, "logps/rejected": -5.095160007476807, "loss": 0.0533, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.5039963722229, "rewards/margins": 0.5911641716957092, "rewards/rejected": -5.095160007476807, "sft_loss": 4.220867156982422, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 0.6073875924204214, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.27543169260025024, "logits/rejected": -0.3689972162246704, "logps/chosen": -4.526713848114014, "logps/rejected": -5.047951698303223, "loss": 0.0536, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.526713848114014, "rewards/margins": 0.5212381482124329, "rewards/rejected": -5.047951698303223, "sft_loss": 4.3203959465026855, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 0.3938937307686175, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.4955802857875824, "logits/rejected": -0.3202812373638153, "logps/chosen": -4.622469425201416, "logps/rejected": -5.246644973754883, "loss": 0.0514, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.622469425201416, "rewards/margins": 0.6241754293441772, "rewards/rejected": -5.246644973754883, "sft_loss": 4.328534126281738, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 0.5272944726316406, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.5755268931388855, "logits/rejected": -0.33584824204444885, "logps/chosen": -4.600456714630127, "logps/rejected": -5.08190393447876, "loss": 0.0528, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.600456714630127, "rewards/margins": 0.4814472794532776, "rewards/rejected": -5.08190393447876, "sft_loss": 4.373833179473877, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 0.4155006152783869, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.551335334777832, "logits/rejected": -0.4132702350616455, "logps/chosen": -4.421041011810303, "logps/rejected": -4.975304126739502, "loss": 0.0508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.421041011810303, "rewards/margins": 0.554263174533844, "rewards/rejected": -4.975304126739502, "sft_loss": 4.110798358917236, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 0.6515975463345118, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.401276171207428, "logits/rejected": -0.25514617562294006, "logps/chosen": -4.445773124694824, "logps/rejected": -4.844512462615967, "loss": 0.0539, "rewards/accuracies": 0.6875, "rewards/chosen": -4.445773124694824, "rewards/margins": 0.398739218711853, "rewards/rejected": -4.844512462615967, "sft_loss": 4.16897439956665, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 0.3446000707611468, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.43346720933914185, "logits/rejected": -0.35360580682754517, "logps/chosen": -4.3459696769714355, "logps/rejected": -5.048262596130371, "loss": 0.0513, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.3459696769714355, "rewards/margins": 0.7022929191589355, "rewards/rejected": -5.048262596130371, "sft_loss": 4.152453422546387, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 0.3217984812990353, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.4467714726924896, "logits/rejected": -0.2542513608932495, "logps/chosen": -4.547066688537598, "logps/rejected": -5.322503566741943, "loss": 0.0506, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.547066688537598, "rewards/margins": 0.7754372358322144, "rewards/rejected": -5.322503566741943, "sft_loss": 4.235215187072754, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 0.6050168275087299, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.3104148209095001, "logits/rejected": -0.10100110620260239, "logps/chosen": -4.59012508392334, "logps/rejected": -5.028843879699707, "loss": 0.0522, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.59012508392334, "rewards/margins": 0.4387180209159851, "rewards/rejected": -5.028843879699707, "sft_loss": 4.259760856628418, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 0.5526542417463417, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.27798131108283997, "logits/rejected": -0.16789737343788147, "logps/chosen": -4.456320762634277, "logps/rejected": -5.029903411865234, "loss": 0.0524, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.456320762634277, "rewards/margins": 0.5735821723937988, "rewards/rejected": -5.029903411865234, "sft_loss": 4.188068389892578, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 0.6725358519055418, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.43549829721450806, "logits/rejected": -0.23555438220500946, "logps/chosen": -4.349188804626465, "logps/rejected": -5.042940139770508, "loss": 0.0517, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.349188804626465, "rewards/margins": 0.6937510371208191, "rewards/rejected": -5.042940139770508, "sft_loss": 4.0453596115112305, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 0.47214900467098986, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.2399250715970993, "logits/rejected": -0.09774953126907349, "logps/chosen": -4.491705417633057, "logps/rejected": -5.064525604248047, "loss": 0.051, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.491705417633057, "rewards/margins": 0.5728203654289246, "rewards/rejected": -5.064525604248047, "sft_loss": 4.136842727661133, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 0.4972109814218113, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.3452068865299225, "logits/rejected": -0.15056590735912323, "logps/chosen": -4.566656589508057, "logps/rejected": -5.146246433258057, "loss": 0.0521, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.566656589508057, "rewards/margins": 0.5795894861221313, "rewards/rejected": -5.146246433258057, "sft_loss": 4.118401050567627, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 0.6882142186528432, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.367424339056015, "logits/rejected": -0.16041013598442078, "logps/chosen": -4.63809061050415, "logps/rejected": -5.268001079559326, "loss": 0.052, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.63809061050415, "rewards/margins": 0.6299105286598206, "rewards/rejected": -5.268001079559326, "sft_loss": 4.292515754699707, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 0.48308726153623693, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.40587466955184937, "logits/rejected": -0.11217250674962997, "logps/chosen": -4.291191577911377, "logps/rejected": -5.193698883056641, "loss": 0.0491, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.291191577911377, "rewards/margins": 0.9025076627731323, "rewards/rejected": -5.193698883056641, "sft_loss": 4.034631729125977, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 0.5935174412481173, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.4540501534938812, "logits/rejected": -0.1173338070511818, "logps/chosen": -4.344797611236572, "logps/rejected": -5.267854690551758, "loss": 0.0505, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.344797611236572, "rewards/margins": 0.9230567216873169, "rewards/rejected": -5.267854690551758, "sft_loss": 3.9546589851379395, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 0.6750078977580464, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.19961309432983398, "logits/rejected": -0.2153279036283493, "logps/chosen": -4.354008674621582, "logps/rejected": -4.975162506103516, "loss": 0.0528, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.354008674621582, "rewards/margins": 0.6211541891098022, "rewards/rejected": -4.975162506103516, "sft_loss": 4.119563102722168, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 0.43839706775658527, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.35946568846702576, "logits/rejected": -0.12679320573806763, "logps/chosen": -4.462148189544678, "logps/rejected": -5.072259902954102, "loss": 0.0532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.462148189544678, "rewards/margins": 0.6101123690605164, "rewards/rejected": -5.072259902954102, "sft_loss": 4.199645519256592, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 0.5240237174493583, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.5560758709907532, "logits/rejected": -0.2003978192806244, "logps/chosen": -4.430878639221191, "logps/rejected": -5.3489251136779785, "loss": 0.0511, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.430878639221191, "rewards/margins": 0.9180465936660767, "rewards/rejected": -5.3489251136779785, "sft_loss": 4.257626533508301, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 0.34362372364773286, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.37839382886886597, "logits/rejected": -0.4028445780277252, "logps/chosen": -4.604657173156738, "logps/rejected": -5.0622429847717285, "loss": 0.0518, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.604657173156738, "rewards/margins": 0.45758503675460815, "rewards/rejected": -5.0622429847717285, "sft_loss": 4.296733856201172, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 0.39810904002106345, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.5207768082618713, "logits/rejected": -0.3638109266757965, "logps/chosen": -4.3686041831970215, "logps/rejected": -5.053372383117676, "loss": 0.0505, "rewards/accuracies": 0.75, "rewards/chosen": -4.3686041831970215, "rewards/margins": 0.684768795967102, "rewards/rejected": -5.053372383117676, "sft_loss": 4.103168964385986, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 0.4173210251113381, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.3816406726837158, "logits/rejected": -0.2449624091386795, "logps/chosen": -4.267149448394775, "logps/rejected": -4.792271614074707, "loss": 0.0516, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.267149448394775, "rewards/margins": 0.525122344493866, "rewards/rejected": -4.792271614074707, "sft_loss": 3.9861807823181152, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 0.5206031699456903, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.3659888505935669, "logits/rejected": -0.22819384932518005, "logps/chosen": -4.522797584533691, "logps/rejected": -5.074974060058594, "loss": 0.0523, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.522797584533691, "rewards/margins": 0.5521765351295471, "rewards/rejected": -5.074974060058594, "sft_loss": 4.2375807762146, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 0.5951169620080391, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.32757097482681274, "logits/rejected": -0.1094558984041214, "logps/chosen": -4.466285705566406, "logps/rejected": -5.261303424835205, "loss": 0.0501, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.466285705566406, "rewards/margins": 0.7950171232223511, "rewards/rejected": -5.261303424835205, "sft_loss": 4.161604881286621, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 0.5900488732663051, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.33634185791015625, "logits/rejected": -0.11354222148656845, "logps/chosen": -4.338430881500244, "logps/rejected": -5.108603000640869, "loss": 0.0502, "rewards/accuracies": 0.71875, "rewards/chosen": -4.338430881500244, "rewards/margins": 0.7701722383499146, "rewards/rejected": -5.108603000640869, "sft_loss": 4.010077953338623, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 0.43817436520379927, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.26562008261680603, "logits/rejected": -0.16547879576683044, "logps/chosen": -4.503973484039307, "logps/rejected": -5.440014839172363, "loss": 0.0501, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.503973484039307, "rewards/margins": 0.9360410571098328, "rewards/rejected": -5.440014839172363, "sft_loss": 4.1721086502075195, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 0.4595898472433441, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.3069925606250763, "logits/rejected": -0.14767572283744812, "logps/chosen": -4.208481311798096, "logps/rejected": -4.952236652374268, "loss": 0.0507, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.208481311798096, "rewards/margins": 0.7437552213668823, "rewards/rejected": -4.952236652374268, "sft_loss": 3.812131404876709, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 0.5305346877025717, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.1933150589466095, "logits/rejected": -0.0845588892698288, "logps/chosen": -4.497615814208984, "logps/rejected": -5.3453803062438965, "loss": 0.0495, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.497615814208984, "rewards/margins": 0.8477641344070435, "rewards/rejected": -5.3453803062438965, "sft_loss": 4.077464580535889, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 0.6410263342308636, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.276964008808136, "logits/rejected": -0.16606521606445312, "logps/chosen": -4.369429588317871, "logps/rejected": -5.116156578063965, "loss": 0.0511, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.369429588317871, "rewards/margins": 0.7467272281646729, "rewards/rejected": -5.116156578063965, "sft_loss": 4.034445762634277, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 0.44830176640192815, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.4733208119869232, "logits/rejected": -0.18805237114429474, "logps/chosen": -4.483094692230225, "logps/rejected": -5.300882339477539, "loss": 0.0518, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.483094692230225, "rewards/margins": 0.8177868127822876, "rewards/rejected": -5.300882339477539, "sft_loss": 4.2453718185424805, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 0.6409870632514963, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.314197838306427, "logits/rejected": -0.17623493075370789, "logps/chosen": -4.647237777709961, "logps/rejected": -5.2594404220581055, "loss": 0.0522, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.647237777709961, "rewards/margins": 0.612203061580658, "rewards/rejected": -5.2594404220581055, "sft_loss": 4.35164737701416, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 0.3504734308346918, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.5192325115203857, "logits/rejected": -0.46321648359298706, "logps/chosen": -4.322511672973633, "logps/rejected": -4.9438066482543945, "loss": 0.0517, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.322511672973633, "rewards/margins": 0.6212958097457886, "rewards/rejected": -4.9438066482543945, "sft_loss": 4.152332305908203, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 0.5039716273254905, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.42465925216674805, "logits/rejected": -0.3103640377521515, "logps/chosen": -4.393340110778809, "logps/rejected": -5.021480560302734, "loss": 0.0531, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.393340110778809, "rewards/margins": 0.6281408071517944, "rewards/rejected": -5.021480560302734, "sft_loss": 4.2171502113342285, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 0.32450018350172033, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.39585283398628235, "logits/rejected": -0.3093074560165405, "logps/chosen": -4.389555931091309, "logps/rejected": -4.971273899078369, "loss": 0.052, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.389555931091309, "rewards/margins": 0.5817176103591919, "rewards/rejected": -4.971273899078369, "sft_loss": 4.138664245605469, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 0.4160051311346219, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.48266953229904175, "logits/rejected": -0.3032703101634979, "logps/chosen": -4.55446720123291, "logps/rejected": -5.380070209503174, "loss": 0.0515, "rewards/accuracies": 0.71875, "rewards/chosen": -4.55446720123291, "rewards/margins": 0.8256031274795532, "rewards/rejected": -5.380070209503174, "sft_loss": 4.303145408630371, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 0.7984546744437679, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.34332793951034546, "logits/rejected": -0.4161340594291687, "logps/chosen": -4.649590492248535, "logps/rejected": -5.079288482666016, "loss": 0.0541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.649590492248535, "rewards/margins": 0.42969760298728943, "rewards/rejected": -5.079288482666016, "sft_loss": 4.418673515319824, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 0.522883901950886, "learning_rate": 5.890726635828919e-07, "logits/chosen": -0.34629374742507935, "logits/rejected": -0.37964576482772827, "logps/chosen": -4.412274360656738, "logps/rejected": -4.9246368408203125, "loss": 0.0531, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.412274360656738, "rewards/margins": 0.5123627781867981, "rewards/rejected": -4.9246368408203125, "sft_loss": 4.1936140060424805, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 0.46071677852464993, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.4360761046409607, "logits/rejected": -0.4055995047092438, "logps/chosen": -4.450671195983887, "logps/rejected": -5.006396293640137, "loss": 0.0515, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.450671195983887, "rewards/margins": 0.5557257533073425, "rewards/rejected": -5.006396293640137, "sft_loss": 4.149048805236816, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.09837622940540314, "eval_logits/rejected": 0.20496821403503418, "eval_logps/chosen": -4.444206237792969, "eval_logps/rejected": -5.07861852645874, "eval_loss": 0.05041274055838585, "eval_rewards/accuracies": 0.6824925541877747, "eval_rewards/chosen": -4.444206237792969, "eval_rewards/margins": 0.6344121694564819, "eval_rewards/rejected": -5.07861852645874, "eval_runtime": 46.5888, "eval_samples_per_second": 28.87, "eval_sft_loss": 4.093306541442871, "eval_steps_per_second": 7.233, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 0.6572158687306725, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.605633020401001, "logits/rejected": -0.3718469738960266, "logps/chosen": -4.480620384216309, "logps/rejected": -5.0697431564331055, "loss": 0.0519, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.480620384216309, "rewards/margins": 0.5891224145889282, "rewards/rejected": -5.0697431564331055, "sft_loss": 4.248944282531738, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 0.6664663344359709, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.3627350926399231, "logits/rejected": -0.24719932675361633, "logps/chosen": -4.511195182800293, "logps/rejected": -5.193454265594482, "loss": 0.0512, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.511195182800293, "rewards/margins": 0.6822598576545715, "rewards/rejected": -5.193454265594482, "sft_loss": 4.303326606750488, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 0.5120494751105008, "learning_rate": 5.829359458171714e-07, "logits/chosen": -0.34684649109840393, "logits/rejected": -0.24992990493774414, "logps/chosen": -4.363380432128906, "logps/rejected": -5.179116249084473, "loss": 0.0505, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.363380432128906, "rewards/margins": 0.8157358169555664, "rewards/rejected": -5.179116249084473, "sft_loss": 4.205404758453369, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 0.6874585270699716, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.4058879017829895, "logits/rejected": -0.17549274861812592, "logps/chosen": -4.187855243682861, "logps/rejected": -4.797568321228027, "loss": 0.0525, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.187855243682861, "rewards/margins": 0.6097137928009033, "rewards/rejected": -4.797568321228027, "sft_loss": 4.016551494598389, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 0.6060820378492465, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.4957265257835388, "logits/rejected": -0.2640833258628845, "logps/chosen": -4.30916690826416, "logps/rejected": -5.123547554016113, "loss": 0.0515, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.30916690826416, "rewards/margins": 0.8143804669380188, "rewards/rejected": -5.123547554016113, "sft_loss": 4.097870826721191, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 0.5151875125094709, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.3500848412513733, "logits/rejected": -0.2688957750797272, "logps/chosen": -4.340577602386475, "logps/rejected": -5.0534515380859375, "loss": 0.0489, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.340577602386475, "rewards/margins": 0.7128733396530151, "rewards/rejected": -5.0534515380859375, "sft_loss": 3.984767198562622, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 0.46145166440756263, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.4007895588874817, "logits/rejected": -0.2340659350156784, "logps/chosen": -4.39790678024292, "logps/rejected": -5.234405517578125, "loss": 0.0514, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.39790678024292, "rewards/margins": 0.8364987373352051, "rewards/rejected": -5.234405517578125, "sft_loss": 4.1350483894348145, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 0.5800643184003756, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.38760238885879517, "logits/rejected": -0.33849984407424927, "logps/chosen": -4.464108467102051, "logps/rejected": -5.05782413482666, "loss": 0.0527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.464108467102051, "rewards/margins": 0.5937153100967407, "rewards/rejected": -5.05782413482666, "sft_loss": 4.192983150482178, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 0.489683355986247, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.4335315227508545, "logits/rejected": -0.3097968101501465, "logps/chosen": -4.670001029968262, "logps/rejected": -5.261584281921387, "loss": 0.0528, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.670001029968262, "rewards/margins": 0.5915828347206116, "rewards/rejected": -5.261584281921387, "sft_loss": 4.404904365539551, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 0.7102902511949915, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.45332375168800354, "logits/rejected": -0.36583977937698364, "logps/chosen": -4.492857933044434, "logps/rejected": -5.338987827301025, "loss": 0.0499, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.492857933044434, "rewards/margins": 0.8461304903030396, "rewards/rejected": -5.338987827301025, "sft_loss": 4.102261066436768, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 0.579184068772222, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.5107121467590332, "logits/rejected": -0.27316388487815857, "logps/chosen": -4.263310432434082, "logps/rejected": -4.881662368774414, "loss": 0.0516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.263310432434082, "rewards/margins": 0.6183524131774902, "rewards/rejected": -4.881662368774414, "sft_loss": 4.044577598571777, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 0.42738608993363675, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.5125621557235718, "logits/rejected": -0.2538798749446869, "logps/chosen": -4.523799896240234, "logps/rejected": -5.066987037658691, "loss": 0.0513, "rewards/accuracies": 0.6875, "rewards/chosen": -4.523799896240234, "rewards/margins": 0.5431872010231018, "rewards/rejected": -5.066987037658691, "sft_loss": 4.190533638000488, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 0.681260006283264, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.48108816146850586, "logits/rejected": -0.30100640654563904, "logps/chosen": -4.549483299255371, "logps/rejected": -5.0618977546691895, "loss": 0.0527, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.549483299255371, "rewards/margins": 0.5124139785766602, "rewards/rejected": -5.0618977546691895, "sft_loss": 4.250813007354736, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 0.5920717815446642, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.45581454038619995, "logits/rejected": -0.31429341435432434, "logps/chosen": -4.41359806060791, "logps/rejected": -5.109930515289307, "loss": 0.0505, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.41359806060791, "rewards/margins": 0.6963319778442383, "rewards/rejected": -5.109930515289307, "sft_loss": 4.015780448913574, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 0.6330137390349065, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.3992709219455719, "logits/rejected": -0.28337493538856506, "logps/chosen": -4.330594539642334, "logps/rejected": -4.930819034576416, "loss": 0.0501, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.330594539642334, "rewards/margins": 0.6002241969108582, "rewards/rejected": -4.930819034576416, "sft_loss": 3.878619432449341, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 0.8940253461870524, "learning_rate": 5.629076484188952e-07, "logits/chosen": -0.24059781432151794, "logits/rejected": -0.12366914749145508, "logps/chosen": -4.571654796600342, "logps/rejected": -5.292092800140381, "loss": 0.052, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.571654796600342, "rewards/margins": 0.7204381823539734, "rewards/rejected": -5.292092800140381, "sft_loss": 4.188701152801514, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 0.4704393131487359, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.3739601969718933, "logits/rejected": -0.23108768463134766, "logps/chosen": -4.504766464233398, "logps/rejected": -5.312743663787842, "loss": 0.051, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.504766464233398, "rewards/margins": 0.807977557182312, "rewards/rejected": -5.312743663787842, "sft_loss": 4.1565680503845215, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 0.511447960826902, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.3512064814567566, "logits/rejected": -0.1589089035987854, "logps/chosen": -4.425866603851318, "logps/rejected": -5.213771820068359, "loss": 0.0507, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.425866603851318, "rewards/margins": 0.7879055738449097, "rewards/rejected": -5.213771820068359, "sft_loss": 4.079559326171875, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 0.5089415103306051, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.38519638776779175, "logits/rejected": -0.18898412585258484, "logps/chosen": -4.655232906341553, "logps/rejected": -5.344581604003906, "loss": 0.0525, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.655232906341553, "rewards/margins": 0.6893488168716431, "rewards/rejected": -5.344581604003906, "sft_loss": 4.351537704467773, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 0.5171631219205263, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.30115431547164917, "logits/rejected": -0.20378553867340088, "logps/chosen": -4.3300395011901855, "logps/rejected": -5.086686134338379, "loss": 0.0488, "rewards/accuracies": 0.6875, "rewards/chosen": -4.3300395011901855, "rewards/margins": 0.7566461563110352, "rewards/rejected": -5.086686134338379, "sft_loss": 3.8978729248046875, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 0.7414383342459527, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.2479126900434494, "logits/rejected": -0.26643773913383484, "logps/chosen": -4.493429660797119, "logps/rejected": -5.09151554107666, "loss": 0.0526, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.493429660797119, "rewards/margins": 0.5980857610702515, "rewards/rejected": -5.09151554107666, "sft_loss": 4.203255653381348, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 0.43566242923597975, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.36846572160720825, "logits/rejected": -0.24956099689006805, "logps/chosen": -4.325127601623535, "logps/rejected": -4.973796367645264, "loss": 0.0505, "rewards/accuracies": 0.6875, "rewards/chosen": -4.325127601623535, "rewards/margins": 0.6486689448356628, "rewards/rejected": -4.973796367645264, "sft_loss": 3.9762065410614014, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 0.4748215414635767, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.2849575877189636, "logits/rejected": -0.05325264856219292, "logps/chosen": -4.543246269226074, "logps/rejected": -5.216692924499512, "loss": 0.0515, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.543246269226074, "rewards/margins": 0.6734462976455688, "rewards/rejected": -5.216692924499512, "sft_loss": 4.21115779876709, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 1.3413675309441973, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.3605079650878906, "logits/rejected": -0.22670654952526093, "logps/chosen": -4.357509136199951, "logps/rejected": -5.064937591552734, "loss": 0.0519, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.357509136199951, "rewards/margins": 0.7074285745620728, "rewards/rejected": -5.064937591552734, "sft_loss": 4.133220195770264, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 0.6364875128270325, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.27434051036834717, "logits/rejected": -0.1490970402956009, "logps/chosen": -4.532397747039795, "logps/rejected": -5.171026706695557, "loss": 0.0512, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.532397747039795, "rewards/margins": 0.638629138469696, "rewards/rejected": -5.171026706695557, "sft_loss": 4.182089328765869, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 0.5736099451418228, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.45021653175354004, "logits/rejected": -0.2502950429916382, "logps/chosen": -4.460026741027832, "logps/rejected": -5.434047222137451, "loss": 0.0498, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.460026741027832, "rewards/margins": 0.9740206003189087, "rewards/rejected": -5.434047222137451, "sft_loss": 4.120427131652832, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 0.46758669071726616, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.44528883695602417, "logits/rejected": -0.3131914734840393, "logps/chosen": -4.494815349578857, "logps/rejected": -5.001442909240723, "loss": 0.0533, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.494815349578857, "rewards/margins": 0.5066278576850891, "rewards/rejected": -5.001442909240723, "sft_loss": 4.199819564819336, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 0.49191933154206485, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.45389944314956665, "logits/rejected": -0.2852258086204529, "logps/chosen": -4.434971809387207, "logps/rejected": -5.1843976974487305, "loss": 0.0513, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.434971809387207, "rewards/margins": 0.7494255900382996, "rewards/rejected": -5.1843976974487305, "sft_loss": 4.208171844482422, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 0.9037410118950826, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.3565642237663269, "logits/rejected": -0.24279490113258362, "logps/chosen": -4.221377372741699, "logps/rejected": -4.9192304611206055, "loss": 0.0509, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.221377372741699, "rewards/margins": 0.6978529691696167, "rewards/rejected": -4.9192304611206055, "sft_loss": 3.895989179611206, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 0.5071819263495008, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.3867717981338501, "logits/rejected": -0.0971466600894928, "logps/chosen": -4.355823516845703, "logps/rejected": -5.244298934936523, "loss": 0.0505, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.355823516845703, "rewards/margins": 0.8884755969047546, "rewards/rejected": -5.244298934936523, "sft_loss": 4.127333164215088, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 0.9240705189349777, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.27731746435165405, "logits/rejected": -0.11383461952209473, "logps/chosen": -4.596616268157959, "logps/rejected": -5.186906814575195, "loss": 0.0518, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.596616268157959, "rewards/margins": 0.590290904045105, "rewards/rejected": -5.186906814575195, "sft_loss": 4.284438610076904, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 0.6753498654428448, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.40660548210144043, "logits/rejected": -0.21776506304740906, "logps/chosen": -4.394054412841797, "logps/rejected": -5.0748724937438965, "loss": 0.0516, "rewards/accuracies": 0.71875, "rewards/chosen": -4.394054412841797, "rewards/margins": 0.6808184385299683, "rewards/rejected": -5.0748724937438965, "sft_loss": 4.122300148010254, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 0.4671400500652704, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.37436243891716003, "logits/rejected": -0.19707582890987396, "logps/chosen": -4.417657375335693, "logps/rejected": -5.068818092346191, "loss": 0.0521, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.417657375335693, "rewards/margins": 0.6511603593826294, "rewards/rejected": -5.068818092346191, "sft_loss": 4.166541576385498, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 0.49230232744365754, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.42794451117515564, "logits/rejected": -0.20655164122581482, "logps/chosen": -4.463809013366699, "logps/rejected": -5.186387062072754, "loss": 0.0506, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.463809013366699, "rewards/margins": 0.7225781679153442, "rewards/rejected": -5.186387062072754, "sft_loss": 4.121039867401123, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 0.8790707326411614, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.33565372228622437, "logits/rejected": -0.13639724254608154, "logps/chosen": -4.411907196044922, "logps/rejected": -5.098509788513184, "loss": 0.0518, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.411907196044922, "rewards/margins": 0.6866029500961304, "rewards/rejected": -5.098509788513184, "sft_loss": 4.14210319519043, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 0.4537497662691024, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.3935709595680237, "logits/rejected": -0.16922712326049805, "logps/chosen": -4.428333282470703, "logps/rejected": -5.005343437194824, "loss": 0.052, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.428333282470703, "rewards/margins": 0.5770100355148315, "rewards/rejected": -5.005343437194824, "sft_loss": 4.094406604766846, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 0.49349269120138967, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.4444296360015869, "logits/rejected": -0.17299816012382507, "logps/chosen": -4.411839485168457, "logps/rejected": -5.164982318878174, "loss": 0.051, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.411839485168457, "rewards/margins": 0.7531424760818481, "rewards/rejected": -5.164982318878174, "sft_loss": 4.166609764099121, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 0.5341732858975665, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.44806498289108276, "logits/rejected": -0.20182207226753235, "logps/chosen": -4.432277679443359, "logps/rejected": -5.187029838562012, "loss": 0.0513, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.432277679443359, "rewards/margins": 0.7547519207000732, "rewards/rejected": -5.187029838562012, "sft_loss": 4.210070610046387, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 0.6009087279793405, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.26394909620285034, "logits/rejected": -0.14600220322608948, "logps/chosen": -4.402077674865723, "logps/rejected": -5.284658908843994, "loss": 0.0494, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.402077674865723, "rewards/margins": 0.8825809359550476, "rewards/rejected": -5.284658908843994, "sft_loss": 4.052593231201172, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 0.49808890555063334, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.4313860535621643, "logits/rejected": -0.18899060785770416, "logps/chosen": -4.493370056152344, "logps/rejected": -5.064239501953125, "loss": 0.0513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.493370056152344, "rewards/margins": 0.5708690881729126, "rewards/rejected": -5.064239501953125, "sft_loss": 4.171633720397949, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 0.8289595837730992, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.5128465294837952, "logits/rejected": -0.2724844217300415, "logps/chosen": -4.469786643981934, "logps/rejected": -4.959803104400635, "loss": 0.0536, "rewards/accuracies": 0.625, "rewards/chosen": -4.469786643981934, "rewards/margins": 0.490016371011734, "rewards/rejected": -4.959803104400635, "sft_loss": 4.14853572845459, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 0.5032451243230273, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.5017607808113098, "logits/rejected": -0.3467402160167694, "logps/chosen": -4.393344879150391, "logps/rejected": -5.1140851974487305, "loss": 0.0535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.393344879150391, "rewards/margins": 0.7207397222518921, "rewards/rejected": -5.1140851974487305, "sft_loss": 4.095994472503662, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 0.4694756225943738, "learning_rate": 5.210187404905735e-07, "logits/chosen": -0.2644937336444855, "logits/rejected": -0.187953382730484, "logps/chosen": -4.710282802581787, "logps/rejected": -5.166679382324219, "loss": 0.0528, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.710282802581787, "rewards/margins": 0.45639634132385254, "rewards/rejected": -5.166679382324219, "sft_loss": 4.37774133682251, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 0.357064549599628, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.5153255462646484, "logits/rejected": -0.35782045125961304, "logps/chosen": -4.570986270904541, "logps/rejected": -5.14137601852417, "loss": 0.0526, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.570986270904541, "rewards/margins": 0.570389986038208, "rewards/rejected": -5.14137601852417, "sft_loss": 4.399497032165527, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 0.3821581130831781, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.5005272626876831, "logits/rejected": -0.26194092631340027, "logps/chosen": -4.353894233703613, "logps/rejected": -4.978847980499268, "loss": 0.0513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.353894233703613, "rewards/margins": 0.6249544024467468, "rewards/rejected": -4.978847980499268, "sft_loss": 4.077733516693115, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 0.6565617472724057, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.34044063091278076, "logits/rejected": -0.15807709097862244, "logps/chosen": -4.35715389251709, "logps/rejected": -4.989835262298584, "loss": 0.0517, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.35715389251709, "rewards/margins": 0.6326818466186523, "rewards/rejected": -4.989835262298584, "sft_loss": 4.096672534942627, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 0.5764274735030329, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.21918360888957977, "logits/rejected": -0.06928624957799911, "logps/chosen": -4.338745594024658, "logps/rejected": -4.947409152984619, "loss": 0.05, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.338745594024658, "rewards/margins": 0.6086626648902893, "rewards/rejected": -4.947409152984619, "sft_loss": 3.980414628982544, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 0.5353367303320445, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.24074383080005646, "logits/rejected": -0.1598784625530243, "logps/chosen": -4.4777021408081055, "logps/rejected": -5.272585391998291, "loss": 0.0508, "rewards/accuracies": 0.65625, "rewards/chosen": -4.4777021408081055, "rewards/margins": 0.7948837280273438, "rewards/rejected": -5.272585391998291, "sft_loss": 4.107278823852539, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 0.9053716419441293, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.11156382411718369, "logits/rejected": -0.016821326687932014, "logps/chosen": -4.582097053527832, "logps/rejected": -5.265927791595459, "loss": 0.0509, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.582097053527832, "rewards/margins": 0.6838306784629822, "rewards/rejected": -5.265927791595459, "sft_loss": 4.143289566040039, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 0.38002133797992227, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.2311353236436844, "logits/rejected": -0.1018177717924118, "logps/chosen": -4.600157737731934, "logps/rejected": -5.246756076812744, "loss": 0.0507, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.600157737731934, "rewards/margins": 0.6465979814529419, "rewards/rejected": -5.246756076812744, "sft_loss": 4.1230292320251465, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 0.47238484247484197, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.4059460759162903, "logits/rejected": -0.1782340258359909, "logps/chosen": -4.562694549560547, "logps/rejected": -5.263379096984863, "loss": 0.053, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.562694549560547, "rewards/margins": 0.7006848454475403, "rewards/rejected": -5.263379096984863, "sft_loss": 4.330922603607178, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 0.46008827011632475, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.30883657932281494, "logits/rejected": -0.08638667315244675, "logps/chosen": -4.29897403717041, "logps/rejected": -5.054502010345459, "loss": 0.0519, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.29897403717041, "rewards/margins": 0.7555279731750488, "rewards/rejected": -5.054502010345459, "sft_loss": 4.01223611831665, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 0.49637131140727475, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.32126617431640625, "logits/rejected": -0.2086419314146042, "logps/chosen": -4.370186805725098, "logps/rejected": -5.012415409088135, "loss": 0.0524, "rewards/accuracies": 0.65625, "rewards/chosen": -4.370186805725098, "rewards/margins": 0.6422282457351685, "rewards/rejected": -5.012415409088135, "sft_loss": 4.082181453704834, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 0.5200535244364285, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.43776053190231323, "logits/rejected": -0.3125496804714203, "logps/chosen": -4.482710361480713, "logps/rejected": -5.104378700256348, "loss": 0.052, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.482710361480713, "rewards/margins": 0.6216682195663452, "rewards/rejected": -5.104378700256348, "sft_loss": 4.186079025268555, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 0.41934804007219456, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.504966139793396, "logits/rejected": -0.4237605035305023, "logps/chosen": -4.654808044433594, "logps/rejected": -5.295411586761475, "loss": 0.0517, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.654808044433594, "rewards/margins": 0.6406036019325256, "rewards/rejected": -5.295411586761475, "sft_loss": 4.294847011566162, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 0.4787343001068853, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.5634989738464355, "logits/rejected": -0.3325952887535095, "logps/chosen": -4.372393608093262, "logps/rejected": -5.1716814041137695, "loss": 0.0505, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.372393608093262, "rewards/margins": 0.7992880344390869, "rewards/rejected": -5.1716814041137695, "sft_loss": 4.124917030334473, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 0.33250809246420404, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.40238314867019653, "logits/rejected": -0.3741799294948578, "logps/chosen": -4.596774101257324, "logps/rejected": -5.119741439819336, "loss": 0.0522, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.596774101257324, "rewards/margins": 0.5229678750038147, "rewards/rejected": -5.119741439819336, "sft_loss": 4.286962509155273, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 0.517760589433127, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.326946496963501, "logits/rejected": -0.2680968642234802, "logps/chosen": -4.346710205078125, "logps/rejected": -4.902353763580322, "loss": 0.0524, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.346710205078125, "rewards/margins": 0.5556432604789734, "rewards/rejected": -4.902353763580322, "sft_loss": 4.1214799880981445, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 0.6963941422892898, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.36718350648880005, "logits/rejected": -0.1814633309841156, "logps/chosen": -4.445253372192383, "logps/rejected": -4.956208229064941, "loss": 0.053, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.445253372192383, "rewards/margins": 0.5109549760818481, "rewards/rejected": -4.956208229064941, "sft_loss": 4.123419284820557, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 0.47932502119566067, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.46029338240623474, "logits/rejected": -0.3233875632286072, "logps/chosen": -4.486559867858887, "logps/rejected": -5.101731300354004, "loss": 0.0521, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.486559867858887, "rewards/margins": 0.6151722073554993, "rewards/rejected": -5.101731300354004, "sft_loss": 4.225186347961426, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 0.5705615653143404, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.4992721676826477, "logits/rejected": -0.22667022049427032, "logps/chosen": -4.493367671966553, "logps/rejected": -5.135613441467285, "loss": 0.0526, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.493367671966553, "rewards/margins": 0.6422454118728638, "rewards/rejected": -5.135613441467285, "sft_loss": 4.319209098815918, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 0.4273187134282065, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.3602706789970398, "logits/rejected": -0.19205181300640106, "logps/chosen": -4.5518798828125, "logps/rejected": -5.212324142456055, "loss": 0.0506, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.5518798828125, "rewards/margins": 0.6604443788528442, "rewards/rejected": -5.212324142456055, "sft_loss": 4.110279560089111, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 0.4942906202027163, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.353047639131546, "logits/rejected": -0.21400539577007294, "logps/chosen": -4.460274696350098, "logps/rejected": -5.068617820739746, "loss": 0.0509, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.460274696350098, "rewards/margins": 0.6083430647850037, "rewards/rejected": -5.068617820739746, "sft_loss": 4.063004493713379, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 0.46608384889709426, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.4946712553501129, "logits/rejected": -0.3003917634487152, "logps/chosen": -4.2645416259765625, "logps/rejected": -5.04714298248291, "loss": 0.0511, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.2645416259765625, "rewards/margins": 0.7826014757156372, "rewards/rejected": -5.04714298248291, "sft_loss": 4.0179572105407715, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 0.5031334271882354, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.43839550018310547, "logits/rejected": -0.28657081723213196, "logps/chosen": -4.571595191955566, "logps/rejected": -5.225712776184082, "loss": 0.0515, "rewards/accuracies": 0.6875, "rewards/chosen": -4.571595191955566, "rewards/margins": 0.6541174054145813, "rewards/rejected": -5.225712776184082, "sft_loss": 4.217996597290039, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 0.47945064630778916, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.350267231464386, "logits/rejected": -0.14085647463798523, "logps/chosen": -4.361026287078857, "logps/rejected": -5.13167667388916, "loss": 0.0502, "rewards/accuracies": 0.71875, "rewards/chosen": -4.361026287078857, "rewards/margins": 0.7706495523452759, "rewards/rejected": -5.13167667388916, "sft_loss": 4.0194172859191895, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 0.48807203107233454, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.28777769207954407, "logits/rejected": -0.19730152189731598, "logps/chosen": -4.527838230133057, "logps/rejected": -5.101526737213135, "loss": 0.053, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.527838230133057, "rewards/margins": 0.5736882090568542, "rewards/rejected": -5.101526737213135, "sft_loss": 4.241372108459473, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 0.3600013132470304, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.5948072671890259, "logits/rejected": -0.4410117566585541, "logps/chosen": -4.5470781326293945, "logps/rejected": -5.075103282928467, "loss": 0.0523, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.5470781326293945, "rewards/margins": 0.528024435043335, "rewards/rejected": -5.075103282928467, "sft_loss": 4.2060041427612305, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 0.5094808818966736, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.35384225845336914, "logits/rejected": -0.19477471709251404, "logps/chosen": -4.505183219909668, "logps/rejected": -5.261287689208984, "loss": 0.0508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.505183219909668, "rewards/margins": 0.7561042904853821, "rewards/rejected": -5.261287689208984, "sft_loss": 4.159135341644287, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 0.5864782893861172, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.49953025579452515, "logits/rejected": -0.3851977586746216, "logps/chosen": -4.435602188110352, "logps/rejected": -5.099587440490723, "loss": 0.0515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.435602188110352, "rewards/margins": 0.6639851927757263, "rewards/rejected": -5.099587440490723, "sft_loss": 4.142877578735352, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 0.4774100379036388, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.5168375372886658, "logits/rejected": -0.3822152018547058, "logps/chosen": -4.490707874298096, "logps/rejected": -5.120068550109863, "loss": 0.0515, "rewards/accuracies": 0.71875, "rewards/chosen": -4.490707874298096, "rewards/margins": 0.6293607950210571, "rewards/rejected": -5.120068550109863, "sft_loss": 4.182126998901367, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 0.5003474642863843, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.262535035610199, "logits/rejected": -0.36348050832748413, "logps/chosen": -4.541584014892578, "logps/rejected": -5.141721725463867, "loss": 0.0519, "rewards/accuracies": 0.65625, "rewards/chosen": -4.541584014892578, "rewards/margins": 0.6001380681991577, "rewards/rejected": -5.141721725463867, "sft_loss": 4.230230331420898, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 0.7576816731387156, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.5238488912582397, "logits/rejected": -0.3342793881893158, "logps/chosen": -4.28743314743042, "logps/rejected": -4.850545883178711, "loss": 0.0534, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.28743314743042, "rewards/margins": 0.5631122589111328, "rewards/rejected": -4.850545883178711, "sft_loss": 4.096673011779785, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 0.3851983061345649, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.39444658160209656, "logits/rejected": -0.2814629077911377, "logps/chosen": -4.476624488830566, "logps/rejected": -5.280016899108887, "loss": 0.0508, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.476624488830566, "rewards/margins": 0.803392767906189, "rewards/rejected": -5.280016899108887, "sft_loss": 4.236693859100342, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 0.47151248691810815, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.3986544609069824, "logits/rejected": -0.18574748933315277, "logps/chosen": -4.561751365661621, "logps/rejected": -5.0480241775512695, "loss": 0.052, "rewards/accuracies": 0.65625, "rewards/chosen": -4.561751365661621, "rewards/margins": 0.4862731993198395, "rewards/rejected": -5.0480241775512695, "sft_loss": 4.235430717468262, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 0.49421059769461206, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.2525349259376526, "logits/rejected": -0.20519249141216278, "logps/chosen": -4.362771034240723, "logps/rejected": -5.0366387367248535, "loss": 0.0531, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.362771034240723, "rewards/margins": 0.6738678216934204, "rewards/rejected": -5.0366387367248535, "sft_loss": 4.0702033042907715, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 0.4749868502685346, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.36834999918937683, "logits/rejected": -0.3293028771877289, "logps/chosen": -4.631998538970947, "logps/rejected": -4.988096237182617, "loss": 0.0519, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.631998538970947, "rewards/margins": 0.3560978174209595, "rewards/rejected": -4.988096237182617, "sft_loss": 4.213697910308838, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 0.4898756104693347, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.5464175343513489, "logits/rejected": -0.31873512268066406, "logps/chosen": -4.390994071960449, "logps/rejected": -5.097039222717285, "loss": 0.051, "rewards/accuracies": 0.71875, "rewards/chosen": -4.390994071960449, "rewards/margins": 0.7060455083847046, "rewards/rejected": -5.097039222717285, "sft_loss": 4.161412715911865, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 0.4561997507441212, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.5197268128395081, "logits/rejected": -0.39064133167266846, "logps/chosen": -4.570242881774902, "logps/rejected": -5.085498809814453, "loss": 0.0534, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.570242881774902, "rewards/margins": 0.5152562260627747, "rewards/rejected": -5.085498809814453, "sft_loss": 4.3351335525512695, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 0.5648070703248154, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.4234795570373535, "logits/rejected": -0.21430261433124542, "logps/chosen": -4.63162088394165, "logps/rejected": -5.301953315734863, "loss": 0.0526, "rewards/accuracies": 0.65625, "rewards/chosen": -4.63162088394165, "rewards/margins": 0.6703327298164368, "rewards/rejected": -5.301953315734863, "sft_loss": 4.310842990875244, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.09203866124153137, "eval_logits/rejected": 0.20023885369300842, "eval_logps/chosen": -4.494305610656738, "eval_logps/rejected": -5.15374755859375, "eval_loss": 0.05033748596906662, "eval_rewards/accuracies": 0.6750741600990295, "eval_rewards/chosen": -4.494305610656738, "eval_rewards/margins": 0.6594412326812744, "eval_rewards/rejected": -5.15374755859375, "eval_runtime": 44.6023, "eval_samples_per_second": 30.155, "eval_sft_loss": 4.088565826416016, "eval_steps_per_second": 7.556, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 0.6012174576350848, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.4575120508670807, "logits/rejected": -0.3348286747932434, "logps/chosen": -4.449155330657959, "logps/rejected": -5.158486366271973, "loss": 0.0512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.449155330657959, "rewards/margins": 0.7093305587768555, "rewards/rejected": -5.158486366271973, "sft_loss": 4.231721878051758, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 0.4060400100494984, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.49344611167907715, "logits/rejected": -0.3567366898059845, "logps/chosen": -4.435418128967285, "logps/rejected": -5.056334495544434, "loss": 0.0518, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.435418128967285, "rewards/margins": 0.6209160089492798, "rewards/rejected": -5.056334495544434, "sft_loss": 4.158322811126709, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 0.440167549424582, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.35082921385765076, "logits/rejected": -0.24494728446006775, "logps/chosen": -4.462352752685547, "logps/rejected": -5.083427429199219, "loss": 0.052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.462352752685547, "rewards/margins": 0.6210747361183167, "rewards/rejected": -5.083427429199219, "sft_loss": 4.217515468597412, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 0.5313251152060041, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.5096575617790222, "logits/rejected": -0.15080931782722473, "logps/chosen": -4.45919942855835, "logps/rejected": -5.160483360290527, "loss": 0.0509, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.45919942855835, "rewards/margins": 0.7012836337089539, "rewards/rejected": -5.160483360290527, "sft_loss": 4.170629501342773, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 0.48971300206193963, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.3851815462112427, "logits/rejected": -0.22465872764587402, "logps/chosen": -4.357975959777832, "logps/rejected": -5.021440505981445, "loss": 0.0518, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.357975959777832, "rewards/margins": 0.6634647846221924, "rewards/rejected": -5.021440505981445, "sft_loss": 4.036141395568848, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 0.6060982224485721, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.4702853560447693, "logits/rejected": -0.2630925178527832, "logps/chosen": -4.5082268714904785, "logps/rejected": -5.44030237197876, "loss": 0.0508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.5082268714904785, "rewards/margins": 0.9320752024650574, "rewards/rejected": -5.44030237197876, "sft_loss": 4.23955774307251, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 0.5406642244933184, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.46813541650772095, "logits/rejected": -0.36023473739624023, "logps/chosen": -4.2994184494018555, "logps/rejected": -5.022599220275879, "loss": 0.0514, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.2994184494018555, "rewards/margins": 0.7231807708740234, "rewards/rejected": -5.022599220275879, "sft_loss": 4.133441925048828, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 0.5646861972185111, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.40759220719337463, "logits/rejected": -0.21394577622413635, "logps/chosen": -4.382226467132568, "logps/rejected": -5.132818698883057, "loss": 0.0505, "rewards/accuracies": 0.75, "rewards/chosen": -4.382226467132568, "rewards/margins": 0.7505923509597778, "rewards/rejected": -5.132818698883057, "sft_loss": 4.103928565979004, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 1.1862512605254292, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.5004664659500122, "logits/rejected": -0.3564170002937317, "logps/chosen": -4.307888507843018, "logps/rejected": -5.052638053894043, "loss": 0.0515, "rewards/accuracies": 0.65625, "rewards/chosen": -4.307888507843018, "rewards/margins": 0.7447504997253418, "rewards/rejected": -5.052638053894043, "sft_loss": 4.0162553787231445, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 0.44291728240441197, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.34964537620544434, "logits/rejected": -0.12260773032903671, "logps/chosen": -4.081252574920654, "logps/rejected": -4.9591217041015625, "loss": 0.0499, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.081252574920654, "rewards/margins": 0.8778694272041321, "rewards/rejected": -4.9591217041015625, "sft_loss": 3.7887864112854004, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 0.4728837633840602, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.37212151288986206, "logits/rejected": -0.2665833532810211, "logps/chosen": -4.327893257141113, "logps/rejected": -5.067246913909912, "loss": 0.0506, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.327893257141113, "rewards/margins": 0.7393532991409302, "rewards/rejected": -5.067246913909912, "sft_loss": 4.035815238952637, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 0.6663758806493927, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.35964614152908325, "logits/rejected": -0.21862125396728516, "logps/chosen": -4.549116611480713, "logps/rejected": -5.252104759216309, "loss": 0.053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.549116611480713, "rewards/margins": 0.7029882669448853, "rewards/rejected": -5.252104759216309, "sft_loss": 4.308150291442871, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 0.6937239177601965, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.30185604095458984, "logits/rejected": -0.1327410191297531, "logps/chosen": -4.816344738006592, "logps/rejected": -5.434140682220459, "loss": 0.0528, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.816344738006592, "rewards/margins": 0.6177955865859985, "rewards/rejected": -5.434140682220459, "sft_loss": 4.486063480377197, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 0.5476027899361571, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.48964279890060425, "logits/rejected": -0.3335990309715271, "logps/chosen": -4.413262844085693, "logps/rejected": -5.010111331939697, "loss": 0.0523, "rewards/accuracies": 0.65625, "rewards/chosen": -4.413262844085693, "rewards/margins": 0.5968478322029114, "rewards/rejected": -5.010111331939697, "sft_loss": 4.17691707611084, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 0.4816724842744238, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.5395382642745972, "logits/rejected": -0.15304972231388092, "logps/chosen": -4.421982765197754, "logps/rejected": -5.152639865875244, "loss": 0.0504, "rewards/accuracies": 0.6875, "rewards/chosen": -4.421982765197754, "rewards/margins": 0.7306567430496216, "rewards/rejected": -5.152639865875244, "sft_loss": 4.0851616859436035, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 0.7336416483819101, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.4231603741645813, "logits/rejected": -0.38552913069725037, "logps/chosen": -4.532313346862793, "logps/rejected": -5.2002739906311035, "loss": 0.0516, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.532313346862793, "rewards/margins": 0.6679608225822449, "rewards/rejected": -5.2002739906311035, "sft_loss": 4.266327857971191, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 0.42548735895031536, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.48353782296180725, "logits/rejected": -0.22564320266246796, "logps/chosen": -4.283061981201172, "logps/rejected": -5.007942199707031, "loss": 0.0511, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.283061981201172, "rewards/margins": 0.724880039691925, "rewards/rejected": -5.007942199707031, "sft_loss": 3.973630905151367, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 0.4704875029146959, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.3822460472583771, "logits/rejected": -0.2654896378517151, "logps/chosen": -4.6180033683776855, "logps/rejected": -5.288962364196777, "loss": 0.0516, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.6180033683776855, "rewards/margins": 0.6709581613540649, "rewards/rejected": -5.288962364196777, "sft_loss": 4.231356143951416, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 0.5886824609412865, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.42286020517349243, "logits/rejected": -0.34611302614212036, "logps/chosen": -4.418186664581299, "logps/rejected": -5.195922374725342, "loss": 0.0505, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.418186664581299, "rewards/margins": 0.7777358293533325, "rewards/rejected": -5.195922374725342, "sft_loss": 4.094481945037842, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 0.4488080995370675, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.552751898765564, "logits/rejected": -0.40471887588500977, "logps/chosen": -4.430189609527588, "logps/rejected": -5.224255084991455, "loss": 0.0506, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.430189609527588, "rewards/margins": 0.7940656542778015, "rewards/rejected": -5.224255084991455, "sft_loss": 4.111329555511475, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 0.42602730666500793, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.5452768206596375, "logits/rejected": -0.3276907801628113, "logps/chosen": -4.437650680541992, "logps/rejected": -5.14551305770874, "loss": 0.0507, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.437650680541992, "rewards/margins": 0.7078622579574585, "rewards/rejected": -5.14551305770874, "sft_loss": 4.128241539001465, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 0.6390670976562808, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.35805755853652954, "logits/rejected": -0.33667880296707153, "logps/chosen": -4.5810627937316895, "logps/rejected": -5.146379470825195, "loss": 0.0521, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.5810627937316895, "rewards/margins": 0.5653164982795715, "rewards/rejected": -5.146379470825195, "sft_loss": 4.303463935852051, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 0.3752889320968409, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.5173454284667969, "logits/rejected": -0.3171837329864502, "logps/chosen": -4.367125511169434, "logps/rejected": -5.153825283050537, "loss": 0.0506, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.367125511169434, "rewards/margins": 0.7866994142532349, "rewards/rejected": -5.153825283050537, "sft_loss": 4.1063642501831055, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 0.5181567088724439, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.42447957396507263, "logits/rejected": -0.12471141666173935, "logps/chosen": -4.273622035980225, "logps/rejected": -5.119785308837891, "loss": 0.0515, "rewards/accuracies": 0.71875, "rewards/chosen": -4.273622035980225, "rewards/margins": 0.8461631536483765, "rewards/rejected": -5.119785308837891, "sft_loss": 4.039144992828369, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 0.39367776087848066, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.49922627210617065, "logits/rejected": -0.2579634487628937, "logps/chosen": -4.542778968811035, "logps/rejected": -5.047327995300293, "loss": 0.053, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.542778968811035, "rewards/margins": 0.5045495629310608, "rewards/rejected": -5.047327995300293, "sft_loss": 4.305514335632324, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 0.5683064013318566, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.4508441388607025, "logits/rejected": -0.32689857482910156, "logps/chosen": -4.481637001037598, "logps/rejected": -5.132130146026611, "loss": 0.0508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.481637001037598, "rewards/margins": 0.6504932045936584, "rewards/rejected": -5.132130146026611, "sft_loss": 4.178994178771973, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 0.4845698813878629, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.4532322883605957, "logits/rejected": -0.2531808614730835, "logps/chosen": -4.547987461090088, "logps/rejected": -5.118274688720703, "loss": 0.0522, "rewards/accuracies": 0.6875, "rewards/chosen": -4.547987461090088, "rewards/margins": 0.5702873468399048, "rewards/rejected": -5.118274688720703, "sft_loss": 4.188256740570068, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 0.5898990946897025, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.4140399396419525, "logits/rejected": -0.18528516590595245, "logps/chosen": -4.273344039916992, "logps/rejected": -5.252946376800537, "loss": 0.0501, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.273344039916992, "rewards/margins": 0.9796028137207031, "rewards/rejected": -5.252946376800537, "sft_loss": 4.037383556365967, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 0.32678783019208785, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.46477198600769043, "logits/rejected": -0.18900129199028015, "logps/chosen": -4.3688130378723145, "logps/rejected": -4.982115745544434, "loss": 0.0516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.3688130378723145, "rewards/margins": 0.6133025288581848, "rewards/rejected": -4.982115745544434, "sft_loss": 4.144090175628662, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 0.5801523203742189, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.5855122804641724, "logits/rejected": -0.41854292154312134, "logps/chosen": -4.312270164489746, "logps/rejected": -4.93634557723999, "loss": 0.0521, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.312270164489746, "rewards/margins": 0.6240752935409546, "rewards/rejected": -4.93634557723999, "sft_loss": 4.130881309509277, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 0.5156421662577778, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.3728213906288147, "logits/rejected": -0.3222619891166687, "logps/chosen": -4.4992995262146, "logps/rejected": -5.148545265197754, "loss": 0.0516, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.4992995262146, "rewards/margins": 0.6492457985877991, "rewards/rejected": -5.148545265197754, "sft_loss": 4.185006141662598, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 0.6314975765154605, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.4350285530090332, "logits/rejected": -0.21461530029773712, "logps/chosen": -4.306244373321533, "logps/rejected": -5.169526100158691, "loss": 0.0507, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.306244373321533, "rewards/margins": 0.86328125, "rewards/rejected": -5.169526100158691, "sft_loss": 4.004863739013672, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 0.34425815053776077, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.5348860621452332, "logits/rejected": -0.2687731385231018, "logps/chosen": -4.5383830070495605, "logps/rejected": -5.160616874694824, "loss": 0.0527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.5383830070495605, "rewards/margins": 0.6222342252731323, "rewards/rejected": -5.160616874694824, "sft_loss": 4.3074493408203125, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 0.44953749468074905, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.46770724654197693, "logits/rejected": -0.22437289357185364, "logps/chosen": -4.51656436920166, "logps/rejected": -5.041363716125488, "loss": 0.0524, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.51656436920166, "rewards/margins": 0.5247992277145386, "rewards/rejected": -5.041363716125488, "sft_loss": 4.19726037979126, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 0.7984227678138108, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.2765560746192932, "logits/rejected": -0.23209819197654724, "logps/chosen": -4.682974338531494, "logps/rejected": -5.259125232696533, "loss": 0.0531, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.682974338531494, "rewards/margins": 0.5761508941650391, "rewards/rejected": -5.259125232696533, "sft_loss": 4.388473987579346, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 0.5857258417880921, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.539823591709137, "logits/rejected": -0.2948388159275055, "logps/chosen": -4.513506889343262, "logps/rejected": -5.19875955581665, "loss": 0.0513, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.513506889343262, "rewards/margins": 0.685252845287323, "rewards/rejected": -5.19875955581665, "sft_loss": 4.188086986541748, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 0.41345892065077483, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.4432690739631653, "logits/rejected": -0.4649744927883148, "logps/chosen": -4.3670573234558105, "logps/rejected": -4.942203044891357, "loss": 0.0522, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.3670573234558105, "rewards/margins": 0.575145423412323, "rewards/rejected": -4.942203044891357, "sft_loss": 4.128762245178223, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 0.3807537260151555, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.517729640007019, "logits/rejected": -0.34260427951812744, "logps/chosen": -4.635345458984375, "logps/rejected": -5.255611419677734, "loss": 0.0526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.635345458984375, "rewards/margins": 0.6202660202980042, "rewards/rejected": -5.255611419677734, "sft_loss": 4.328315258026123, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 0.6093769811207337, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.602250337600708, "logits/rejected": -0.39874905347824097, "logps/chosen": -4.449357986450195, "logps/rejected": -5.050063610076904, "loss": 0.0505, "rewards/accuracies": 0.6875, "rewards/chosen": -4.449357986450195, "rewards/margins": 0.6007059812545776, "rewards/rejected": -5.050063610076904, "sft_loss": 4.129879474639893, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 0.5274513119746579, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.4147399961948395, "logits/rejected": -0.24769814312458038, "logps/chosen": -4.3410162925720215, "logps/rejected": -5.063584804534912, "loss": 0.0501, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.3410162925720215, "rewards/margins": 0.7225686311721802, "rewards/rejected": -5.063584804534912, "sft_loss": 3.992316484451294, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 0.8969781254234718, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.4810507297515869, "logits/rejected": -0.3219291567802429, "logps/chosen": -4.2392802238464355, "logps/rejected": -4.895898342132568, "loss": 0.0508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.2392802238464355, "rewards/margins": 0.6566182374954224, "rewards/rejected": -4.895898342132568, "sft_loss": 3.9705615043640137, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 0.5333461902605714, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.5034727454185486, "logits/rejected": -0.1998063623905182, "logps/chosen": -4.280855655670166, "logps/rejected": -5.128933906555176, "loss": 0.05, "rewards/accuracies": 0.75, "rewards/chosen": -4.280855655670166, "rewards/margins": 0.8480777740478516, "rewards/rejected": -5.128933906555176, "sft_loss": 3.991110324859619, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 0.7210380554678926, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.503537654876709, "logits/rejected": -0.36525729298591614, "logps/chosen": -4.475733280181885, "logps/rejected": -5.242091655731201, "loss": 0.0515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.475733280181885, "rewards/margins": 0.7663584351539612, "rewards/rejected": -5.242091655731201, "sft_loss": 4.2255539894104, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 0.5579904639421918, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.31057581305503845, "logits/rejected": -0.19793424010276794, "logps/chosen": -4.797817230224609, "logps/rejected": -5.580362796783447, "loss": 0.0542, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.797817230224609, "rewards/margins": 0.7825452089309692, "rewards/rejected": -5.580362796783447, "sft_loss": 4.528318881988525, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 0.6518358573852266, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.48437052965164185, "logits/rejected": -0.4504765570163727, "logps/chosen": -4.491649627685547, "logps/rejected": -5.1009202003479, "loss": 0.0509, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.491649627685547, "rewards/margins": 0.609270453453064, "rewards/rejected": -5.1009202003479, "sft_loss": 4.175753116607666, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 0.38888651929066054, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.4671882688999176, "logits/rejected": -0.33713823556900024, "logps/chosen": -4.218931198120117, "logps/rejected": -5.193713188171387, "loss": 0.0493, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.218931198120117, "rewards/margins": 0.9747824668884277, "rewards/rejected": -5.193713188171387, "sft_loss": 3.91839599609375, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 0.45745043157722315, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.3741157650947571, "logits/rejected": -0.24425411224365234, "logps/chosen": -4.257815361022949, "logps/rejected": -4.900847434997559, "loss": 0.0507, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.257815361022949, "rewards/margins": 0.6430323719978333, "rewards/rejected": -4.900847434997559, "sft_loss": 3.8886466026306152, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 0.5181841104189797, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.463533490896225, "logits/rejected": -0.34041827917099, "logps/chosen": -4.538870334625244, "logps/rejected": -5.209536075592041, "loss": 0.0548, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.538870334625244, "rewards/margins": 0.670665442943573, "rewards/rejected": -5.209536075592041, "sft_loss": 4.259721755981445, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 0.7463694923350966, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.4446185231208801, "logits/rejected": -0.18138308823108673, "logps/chosen": -4.314180850982666, "logps/rejected": -5.201914310455322, "loss": 0.0509, "rewards/accuracies": 0.71875, "rewards/chosen": -4.314180850982666, "rewards/margins": 0.8877336382865906, "rewards/rejected": -5.201914310455322, "sft_loss": 4.067704200744629, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 0.7305408844038221, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.5151158571243286, "logits/rejected": -0.2955884337425232, "logps/chosen": -4.570265293121338, "logps/rejected": -5.323184967041016, "loss": 0.052, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.570265293121338, "rewards/margins": 0.7529199719429016, "rewards/rejected": -5.323184967041016, "sft_loss": 4.289970397949219, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 0.6296820490589102, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.456859290599823, "logits/rejected": -0.181920126080513, "logps/chosen": -4.556793689727783, "logps/rejected": -5.294541835784912, "loss": 0.0516, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.556793689727783, "rewards/margins": 0.7377482652664185, "rewards/rejected": -5.294541835784912, "sft_loss": 4.333632469177246, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 0.5484676674948301, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.31831836700439453, "logits/rejected": -0.17829912900924683, "logps/chosen": -4.420655727386475, "logps/rejected": -5.155341625213623, "loss": 0.0511, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.420655727386475, "rewards/margins": 0.7346860766410828, "rewards/rejected": -5.155341625213623, "sft_loss": 4.081669807434082, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 0.4479879814971447, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.4166865944862366, "logits/rejected": -0.3156904876232147, "logps/chosen": -4.233908176422119, "logps/rejected": -4.787781715393066, "loss": 0.0518, "rewards/accuracies": 0.65625, "rewards/chosen": -4.233908176422119, "rewards/margins": 0.5538742542266846, "rewards/rejected": -4.787781715393066, "sft_loss": 3.992141008377075, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 0.5758825106294463, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.4493212103843689, "logits/rejected": -0.36382657289505005, "logps/chosen": -4.502279281616211, "logps/rejected": -5.006508827209473, "loss": 0.0539, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.502279281616211, "rewards/margins": 0.5042295455932617, "rewards/rejected": -5.006508827209473, "sft_loss": 4.146180152893066, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 0.6106162004698598, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.4591015875339508, "logits/rejected": -0.4287734925746918, "logps/chosen": -4.42536735534668, "logps/rejected": -5.106713771820068, "loss": 0.0519, "rewards/accuracies": 0.6875, "rewards/chosen": -4.42536735534668, "rewards/margins": 0.6813467741012573, "rewards/rejected": -5.106713771820068, "sft_loss": 4.198317527770996, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 0.3729145732710297, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.32077568769454956, "logits/rejected": -0.311382919549942, "logps/chosen": -4.4926886558532715, "logps/rejected": -5.12239933013916, "loss": 0.0523, "rewards/accuracies": 0.71875, "rewards/chosen": -4.4926886558532715, "rewards/margins": 0.6297103762626648, "rewards/rejected": -5.12239933013916, "sft_loss": 4.272998809814453, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 0.49448934482541285, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.4439376890659332, "logits/rejected": -0.2607240676879883, "logps/chosen": -4.512078285217285, "logps/rejected": -5.103994369506836, "loss": 0.052, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.512078285217285, "rewards/margins": 0.5919159054756165, "rewards/rejected": -5.103994369506836, "sft_loss": 4.274177551269531, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 0.4570421660226491, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.36230507493019104, "logits/rejected": -0.2706686556339264, "logps/chosen": -4.175656318664551, "logps/rejected": -5.063697814941406, "loss": 0.05, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.175656318664551, "rewards/margins": 0.8880417943000793, "rewards/rejected": -5.063697814941406, "sft_loss": 3.9765090942382812, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 0.4449564164610078, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.5528706312179565, "logits/rejected": -0.3249674141407013, "logps/chosen": -4.501574516296387, "logps/rejected": -5.052069664001465, "loss": 0.0533, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.501574516296387, "rewards/margins": 0.5504950284957886, "rewards/rejected": -5.052069664001465, "sft_loss": 4.305262088775635, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 0.6068412485725911, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.35553085803985596, "logits/rejected": -0.16507504880428314, "logps/chosen": -4.477541923522949, "logps/rejected": -5.131036758422852, "loss": 0.0519, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.477541923522949, "rewards/margins": 0.6534945964813232, "rewards/rejected": -5.131036758422852, "sft_loss": 4.123941421508789, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 0.417124958508913, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.4458470940589905, "logits/rejected": -0.25313228368759155, "logps/chosen": -4.447249412536621, "logps/rejected": -5.175409317016602, "loss": 0.0508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.447249412536621, "rewards/margins": 0.7281599640846252, "rewards/rejected": -5.175409317016602, "sft_loss": 4.110104084014893, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 0.5824911789053943, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.4364047944545746, "logits/rejected": -0.34159305691719055, "logps/chosen": -4.4093122482299805, "logps/rejected": -5.021022319793701, "loss": 0.0514, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.4093122482299805, "rewards/margins": 0.6117098331451416, "rewards/rejected": -5.021022319793701, "sft_loss": 4.1440229415893555, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 0.5231461376544928, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.42022261023521423, "logits/rejected": -0.18159890174865723, "logps/chosen": -4.356125831604004, "logps/rejected": -5.121804237365723, "loss": 0.0514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.356125831604004, "rewards/margins": 0.7656790018081665, "rewards/rejected": -5.121804237365723, "sft_loss": 4.129373550415039, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 0.4080394786532409, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.44390755891799927, "logits/rejected": -0.237187460064888, "logps/chosen": -4.44614315032959, "logps/rejected": -5.215703010559082, "loss": 0.051, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.44614315032959, "rewards/margins": 0.7695599794387817, "rewards/rejected": -5.215703010559082, "sft_loss": 4.167542457580566, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 1.1253008447763322, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.33227282762527466, "logits/rejected": -0.08558958768844604, "logps/chosen": -4.393829345703125, "logps/rejected": -5.135227680206299, "loss": 0.0521, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.393829345703125, "rewards/margins": 0.7413985133171082, "rewards/rejected": -5.135227680206299, "sft_loss": 4.12273645401001, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 0.3619456418861775, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.32817691564559937, "logits/rejected": -0.26207447052001953, "logps/chosen": -4.479277610778809, "logps/rejected": -4.9810686111450195, "loss": 0.0521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.479277610778809, "rewards/margins": 0.5017910599708557, "rewards/rejected": -4.9810686111450195, "sft_loss": 4.196653842926025, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 0.8233495991712592, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.48102402687072754, "logits/rejected": -0.3378520607948303, "logps/chosen": -4.401326656341553, "logps/rejected": -5.100113868713379, "loss": 0.0507, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.401326656341553, "rewards/margins": 0.6987876892089844, "rewards/rejected": -5.100113868713379, "sft_loss": 4.070215225219727, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 0.4385078125633207, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.4696807861328125, "logits/rejected": -0.17892661690711975, "logps/chosen": -4.4610276222229, "logps/rejected": -5.119901657104492, "loss": 0.0523, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.4610276222229, "rewards/margins": 0.6588743925094604, "rewards/rejected": -5.119901657104492, "sft_loss": 4.196292400360107, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 0.8458093070187234, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.5620048642158508, "logits/rejected": -0.19954653084278107, "logps/chosen": -4.403501033782959, "logps/rejected": -5.049036502838135, "loss": 0.0502, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.403501033782959, "rewards/margins": 0.6455354690551758, "rewards/rejected": -5.049036502838135, "sft_loss": 4.065199851989746, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 0.4704386506569429, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.2901856303215027, "logits/rejected": -0.190904900431633, "logps/chosen": -4.472724914550781, "logps/rejected": -5.0304365158081055, "loss": 0.0528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.472724914550781, "rewards/margins": 0.5577119588851929, "rewards/rejected": -5.0304365158081055, "sft_loss": 4.144157886505127, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 0.6006091165206281, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.432685911655426, "logits/rejected": -0.2743949294090271, "logps/chosen": -4.504183769226074, "logps/rejected": -5.2633185386657715, "loss": 0.0513, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.504183769226074, "rewards/margins": 0.7591356635093689, "rewards/rejected": -5.2633185386657715, "sft_loss": 4.193944454193115, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 0.721435017311742, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.39169448614120483, "logits/rejected": -0.26618900895118713, "logps/chosen": -4.552404880523682, "logps/rejected": -5.208046913146973, "loss": 0.0517, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.552404880523682, "rewards/margins": 0.6556424498558044, "rewards/rejected": -5.208046913146973, "sft_loss": 4.27425479888916, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 0.4939259198559468, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.33243894577026367, "logits/rejected": -0.23660437762737274, "logps/chosen": -4.485566139221191, "logps/rejected": -5.194786071777344, "loss": 0.0506, "rewards/accuracies": 0.65625, "rewards/chosen": -4.485566139221191, "rewards/margins": 0.7092195749282837, "rewards/rejected": -5.194786071777344, "sft_loss": 4.126955509185791, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 0.5249982928488253, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.4098309874534607, "logits/rejected": -0.1714651882648468, "logps/chosen": -4.175349712371826, "logps/rejected": -5.012117385864258, "loss": 0.0487, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.175349712371826, "rewards/margins": 0.8367677927017212, "rewards/rejected": -5.012117385864258, "sft_loss": 3.794126510620117, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 0.5075978035965267, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.26183438301086426, "logits/rejected": -0.12572081387043, "logps/chosen": -4.214966297149658, "logps/rejected": -5.030123710632324, "loss": 0.0508, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.214966297149658, "rewards/margins": 0.8151571154594421, "rewards/rejected": -5.030123710632324, "sft_loss": 3.8706488609313965, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 0.43746094254176354, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.23620197176933289, "logits/rejected": -0.1973074972629547, "logps/chosen": -4.420254707336426, "logps/rejected": -5.177483558654785, "loss": 0.0514, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.420254707336426, "rewards/margins": 0.7572286128997803, "rewards/rejected": -5.177483558654785, "sft_loss": 4.062037467956543, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 0.5481978185722167, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.1542886197566986, "logits/rejected": -0.11566226184368134, "logps/chosen": -4.590366840362549, "logps/rejected": -5.307346343994141, "loss": 0.0512, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.590366840362549, "rewards/margins": 0.7169798612594604, "rewards/rejected": -5.307346343994141, "sft_loss": 4.328564167022705, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 1.1503822120526854, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.2250882089138031, "logits/rejected": -0.185527965426445, "logps/chosen": -4.529903411865234, "logps/rejected": -5.08640718460083, "loss": 0.0539, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.529903411865234, "rewards/margins": 0.5565038323402405, "rewards/rejected": -5.08640718460083, "sft_loss": 4.295632839202881, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 0.8891651435865691, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.3869754374027252, "logits/rejected": -0.1528097689151764, "logps/chosen": -4.39151668548584, "logps/rejected": -5.24567174911499, "loss": 0.0515, "rewards/accuracies": 0.71875, "rewards/chosen": -4.39151668548584, "rewards/margins": 0.8541552424430847, "rewards/rejected": -5.24567174911499, "sft_loss": 4.075733184814453, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 0.681986111329172, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.3255468010902405, "logits/rejected": -0.09248501062393188, "logps/chosen": -4.518547058105469, "logps/rejected": -5.2352681159973145, "loss": 0.0533, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.518547058105469, "rewards/margins": 0.7167209982872009, "rewards/rejected": -5.2352681159973145, "sft_loss": 4.26555871963501, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.042076222598552704, "eval_logits/rejected": 0.13484692573547363, "eval_logps/chosen": -4.380853652954102, "eval_logps/rejected": -5.1003336906433105, "eval_loss": 0.05012309178709984, "eval_rewards/accuracies": 0.6824925541877747, "eval_rewards/chosen": -4.380853652954102, "eval_rewards/margins": 0.7194797992706299, "eval_rewards/rejected": -5.1003336906433105, "eval_runtime": 53.2854, "eval_samples_per_second": 25.241, "eval_sft_loss": 3.985746145248413, "eval_steps_per_second": 6.324, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 0.5549631315016357, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.413614422082901, "logits/rejected": -0.4114568829536438, "logps/chosen": -4.466488361358643, "logps/rejected": -5.028023719787598, "loss": 0.0521, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.466488361358643, "rewards/margins": 0.5615357160568237, "rewards/rejected": -5.028023719787598, "sft_loss": 4.167389392852783, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 0.3925853049073582, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.4991392493247986, "logits/rejected": -0.3091749846935272, "logps/chosen": -4.473031044006348, "logps/rejected": -5.131730556488037, "loss": 0.0518, "rewards/accuracies": 0.71875, "rewards/chosen": -4.473031044006348, "rewards/margins": 0.6587000489234924, "rewards/rejected": -5.131730556488037, "sft_loss": 4.212399482727051, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 0.4248304376205085, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.3598117232322693, "logits/rejected": -0.2084875851869583, "logps/chosen": -4.257334232330322, "logps/rejected": -4.964080810546875, "loss": 0.052, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.257334232330322, "rewards/margins": 0.7067463397979736, "rewards/rejected": -4.964080810546875, "sft_loss": 4.068849563598633, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 0.6211146011480987, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.4285960793495178, "logits/rejected": -0.3357129395008087, "logps/chosen": -4.479846000671387, "logps/rejected": -4.998563766479492, "loss": 0.0536, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.479846000671387, "rewards/margins": 0.5187180638313293, "rewards/rejected": -4.998563766479492, "sft_loss": 4.278697490692139, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 0.5518821657498258, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.2991160452365875, "logits/rejected": -0.3689224123954773, "logps/chosen": -4.462956428527832, "logps/rejected": -4.9926347732543945, "loss": 0.0516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.462956428527832, "rewards/margins": 0.529678463935852, "rewards/rejected": -4.9926347732543945, "sft_loss": 4.177762985229492, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 0.4413001057484683, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.418252557516098, "logits/rejected": -0.40913906693458557, "logps/chosen": -4.441468715667725, "logps/rejected": -5.178438186645508, "loss": 0.0522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.441468715667725, "rewards/margins": 0.7369694709777832, "rewards/rejected": -5.178438186645508, "sft_loss": 4.229043006896973, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 0.43302853267366176, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.49432092905044556, "logits/rejected": -0.35859179496765137, "logps/chosen": -4.379761219024658, "logps/rejected": -5.066256523132324, "loss": 0.052, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.379761219024658, "rewards/margins": 0.6864956617355347, "rewards/rejected": -5.066256523132324, "sft_loss": 4.174038887023926, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 0.40957106935824633, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.4507400393486023, "logits/rejected": -0.3858460783958435, "logps/chosen": -4.515069961547852, "logps/rejected": -4.925932884216309, "loss": 0.053, "rewards/accuracies": 0.6875, "rewards/chosen": -4.515069961547852, "rewards/margins": 0.41086310148239136, "rewards/rejected": -4.925932884216309, "sft_loss": 4.25916862487793, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 0.3396296787353694, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.4683416485786438, "logits/rejected": -0.3325764536857605, "logps/chosen": -4.533778190612793, "logps/rejected": -5.040543079376221, "loss": 0.0518, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.533778190612793, "rewards/margins": 0.5067647695541382, "rewards/rejected": -5.040543079376221, "sft_loss": 4.286801815032959, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 0.45233647238819286, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.4944976270198822, "logits/rejected": -0.3757014870643616, "logps/chosen": -4.554055213928223, "logps/rejected": -5.018446445465088, "loss": 0.0534, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.554055213928223, "rewards/margins": 0.4643916189670563, "rewards/rejected": -5.018446445465088, "sft_loss": 4.320440292358398, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 0.5370729215061321, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.3831983208656311, "logits/rejected": -0.26895269751548767, "logps/chosen": -4.414761543273926, "logps/rejected": -5.030882358551025, "loss": 0.0516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.414761543273926, "rewards/margins": 0.6161209344863892, "rewards/rejected": -5.030882358551025, "sft_loss": 4.083725929260254, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 0.772444816008, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.39643681049346924, "logits/rejected": -0.23135873675346375, "logps/chosen": -4.39790153503418, "logps/rejected": -5.099810600280762, "loss": 0.0518, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.39790153503418, "rewards/margins": 0.7019084095954895, "rewards/rejected": -5.099810600280762, "sft_loss": 4.100924491882324, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 0.5423150933685754, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.4202393591403961, "logits/rejected": -0.3306739330291748, "logps/chosen": -4.583292007446289, "logps/rejected": -5.248047828674316, "loss": 0.0513, "rewards/accuracies": 0.625, "rewards/chosen": -4.583292007446289, "rewards/margins": 0.6647554636001587, "rewards/rejected": -5.248047828674316, "sft_loss": 4.254556179046631, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 0.5804743436026095, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.4745996594429016, "logits/rejected": -0.3122026324272156, "logps/chosen": -4.478249549865723, "logps/rejected": -5.229377746582031, "loss": 0.0523, "rewards/accuracies": 0.71875, "rewards/chosen": -4.478249549865723, "rewards/margins": 0.7511278390884399, "rewards/rejected": -5.229377746582031, "sft_loss": 4.368980884552002, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 0.38822005969042495, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.4124499261379242, "logits/rejected": -0.25161346793174744, "logps/chosen": -4.488820552825928, "logps/rejected": -5.219266414642334, "loss": 0.0525, "rewards/accuracies": 0.6875, "rewards/chosen": -4.488820552825928, "rewards/margins": 0.7304463982582092, "rewards/rejected": -5.219266414642334, "sft_loss": 4.217720985412598, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 0.6205345104070085, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.36543330550193787, "logits/rejected": -0.25120458006858826, "logps/chosen": -4.348297119140625, "logps/rejected": -4.99812126159668, "loss": 0.0503, "rewards/accuracies": 0.6875, "rewards/chosen": -4.348297119140625, "rewards/margins": 0.6498240232467651, "rewards/rejected": -4.99812126159668, "sft_loss": 3.9996657371520996, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 0.4684774215516626, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.4492974877357483, "logits/rejected": -0.3519330322742462, "logps/chosen": -4.489088535308838, "logps/rejected": -5.102923393249512, "loss": 0.0521, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.489088535308838, "rewards/margins": 0.6138354539871216, "rewards/rejected": -5.102923393249512, "sft_loss": 4.269659519195557, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 0.4352793387940146, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.4023476243019104, "logits/rejected": -0.18105700612068176, "logps/chosen": -4.410039901733398, "logps/rejected": -5.142436981201172, "loss": 0.0508, "rewards/accuracies": 0.71875, "rewards/chosen": -4.410039901733398, "rewards/margins": 0.7323965430259705, "rewards/rejected": -5.142436981201172, "sft_loss": 4.056788921356201, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 0.7221959376956284, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.38285067677497864, "logits/rejected": -0.2815985679626465, "logps/chosen": -4.474632740020752, "logps/rejected": -5.1458845138549805, "loss": 0.0513, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.474632740020752, "rewards/margins": 0.6712522506713867, "rewards/rejected": -5.1458845138549805, "sft_loss": 4.184819221496582, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 0.42364467886896584, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.3717033267021179, "logits/rejected": -0.2091069519519806, "logps/chosen": -4.41146993637085, "logps/rejected": -5.03031587600708, "loss": 0.0527, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.41146993637085, "rewards/margins": 0.6188455820083618, "rewards/rejected": -5.03031587600708, "sft_loss": 4.186913967132568, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 0.503213529046553, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.3578525185585022, "logits/rejected": -0.2924673557281494, "logps/chosen": -4.406094074249268, "logps/rejected": -5.103278160095215, "loss": 0.051, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.406094074249268, "rewards/margins": 0.6971846222877502, "rewards/rejected": -5.103278160095215, "sft_loss": 4.0739641189575195, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 0.38135488875923174, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.22055137157440186, "logits/rejected": -0.2213733196258545, "logps/chosen": -4.404345989227295, "logps/rejected": -5.086258888244629, "loss": 0.0516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.404345989227295, "rewards/margins": 0.6819120049476624, "rewards/rejected": -5.086258888244629, "sft_loss": 4.12283992767334, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 0.3876450118385628, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.40424853563308716, "logits/rejected": -0.36117544770240784, "logps/chosen": -4.4008331298828125, "logps/rejected": -5.137930870056152, "loss": 0.0514, "rewards/accuracies": 0.71875, "rewards/chosen": -4.4008331298828125, "rewards/margins": 0.7370980381965637, "rewards/rejected": -5.137930870056152, "sft_loss": 4.242586612701416, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 0.5861867090556139, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.41305360198020935, "logits/rejected": -0.2922312617301941, "logps/chosen": -4.416529655456543, "logps/rejected": -5.068317413330078, "loss": 0.0508, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.416529655456543, "rewards/margins": 0.6517875790596008, "rewards/rejected": -5.068317413330078, "sft_loss": 4.030583381652832, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 0.5166229517646469, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.45256465673446655, "logits/rejected": -0.35922300815582275, "logps/chosen": -4.412965774536133, "logps/rejected": -5.209670066833496, "loss": 0.0516, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.412965774536133, "rewards/margins": 0.7967040538787842, "rewards/rejected": -5.209670066833496, "sft_loss": 4.189126014709473, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 0.6824688697780613, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.34892401099205017, "logits/rejected": -0.196553036570549, "logps/chosen": -4.41003942489624, "logps/rejected": -5.10931921005249, "loss": 0.0512, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.41003942489624, "rewards/margins": 0.6992799043655396, "rewards/rejected": -5.10931921005249, "sft_loss": 4.171704292297363, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 1.0901331051063583, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.3078027069568634, "logits/rejected": -0.23698917031288147, "logps/chosen": -4.411991596221924, "logps/rejected": -5.105666637420654, "loss": 0.0524, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.411991596221924, "rewards/margins": 0.6936756372451782, "rewards/rejected": -5.105666637420654, "sft_loss": 4.146667003631592, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 0.5188798925629451, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.3119719922542572, "logits/rejected": -0.2788045406341553, "logps/chosen": -4.284983158111572, "logps/rejected": -4.85286283493042, "loss": 0.0519, "rewards/accuracies": 0.6875, "rewards/chosen": -4.284983158111572, "rewards/margins": 0.5678800344467163, "rewards/rejected": -4.85286283493042, "sft_loss": 3.9691109657287598, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 0.4564540243960645, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.2935374677181244, "logits/rejected": -0.216166689991951, "logps/chosen": -4.339650630950928, "logps/rejected": -5.14935827255249, "loss": 0.0491, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.339650630950928, "rewards/margins": 0.8097079396247864, "rewards/rejected": -5.14935827255249, "sft_loss": 4.013418674468994, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 0.6048823479276922, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.41443270444869995, "logits/rejected": -0.29372677206993103, "logps/chosen": -4.4289727210998535, "logps/rejected": -5.105106830596924, "loss": 0.052, "rewards/accuracies": 0.71875, "rewards/chosen": -4.4289727210998535, "rewards/margins": 0.6761346459388733, "rewards/rejected": -5.105106830596924, "sft_loss": 4.135364532470703, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 0.5046145294593994, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.35901492834091187, "logits/rejected": -0.1820385754108429, "logps/chosen": -4.4694504737854, "logps/rejected": -5.2601518630981445, "loss": 0.052, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.4694504737854, "rewards/margins": 0.7907018065452576, "rewards/rejected": -5.2601518630981445, "sft_loss": 4.254968166351318, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 0.29403349636257936, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.42322462797164917, "logits/rejected": -0.38704365491867065, "logps/chosen": -4.426640510559082, "logps/rejected": -5.285890102386475, "loss": 0.0505, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.426640510559082, "rewards/margins": 0.8592498898506165, "rewards/rejected": -5.285890102386475, "sft_loss": 4.212055206298828, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 0.4409370642018633, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.4164283871650696, "logits/rejected": -0.16731376945972443, "logps/chosen": -4.487136363983154, "logps/rejected": -5.313013076782227, "loss": 0.05, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.487136363983154, "rewards/margins": 0.8258762359619141, "rewards/rejected": -5.313013076782227, "sft_loss": 4.19671630859375, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 0.4638398222614792, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.4540809690952301, "logits/rejected": -0.242467001080513, "logps/chosen": -4.405378341674805, "logps/rejected": -5.238399028778076, "loss": 0.05, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.405378341674805, "rewards/margins": 0.8330209851264954, "rewards/rejected": -5.238399028778076, "sft_loss": 4.111074447631836, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 0.43954904547247514, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.36221879720687866, "logits/rejected": -0.3303019106388092, "logps/chosen": -4.343339920043945, "logps/rejected": -5.14573860168457, "loss": 0.0506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.343339920043945, "rewards/margins": 0.8023991584777832, "rewards/rejected": -5.14573860168457, "sft_loss": 4.104333877563477, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 0.5069367462388984, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.31020763516426086, "logits/rejected": -0.31572312116622925, "logps/chosen": -4.166377067565918, "logps/rejected": -4.930800437927246, "loss": 0.0498, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.166377067565918, "rewards/margins": 0.7644233703613281, "rewards/rejected": -4.930800437927246, "sft_loss": 3.880671977996826, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 0.5671140716617928, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.37395811080932617, "logits/rejected": -0.17182935774326324, "logps/chosen": -4.164129734039307, "logps/rejected": -4.959773063659668, "loss": 0.0502, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.164129734039307, "rewards/margins": 0.7956432104110718, "rewards/rejected": -4.959773063659668, "sft_loss": 3.9170658588409424, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 0.5316215456404498, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.4127592444419861, "logits/rejected": -0.2285337895154953, "logps/chosen": -4.2911787033081055, "logps/rejected": -5.131359100341797, "loss": 0.0516, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.2911787033081055, "rewards/margins": 0.8401795625686646, "rewards/rejected": -5.131359100341797, "sft_loss": 4.116988658905029, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 0.5071073040531272, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.275922954082489, "logits/rejected": -0.2805738151073456, "logps/chosen": -4.463419437408447, "logps/rejected": -5.0827202796936035, "loss": 0.0508, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.463419437408447, "rewards/margins": 0.6193008422851562, "rewards/rejected": -5.0827202796936035, "sft_loss": 4.152365207672119, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 0.6290278382399703, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.31285542249679565, "logits/rejected": -0.14409422874450684, "logps/chosen": -4.381528377532959, "logps/rejected": -5.171832084655762, "loss": 0.0512, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.381528377532959, "rewards/margins": 0.790303111076355, "rewards/rejected": -5.171832084655762, "sft_loss": 4.130490303039551, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 0.4750673772059999, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.3556492030620575, "logits/rejected": -0.19416716694831848, "logps/chosen": -4.3436150550842285, "logps/rejected": -5.072421073913574, "loss": 0.0511, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.3436150550842285, "rewards/margins": 0.7288060188293457, "rewards/rejected": -5.072421073913574, "sft_loss": 4.075199127197266, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 0.561423135841397, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.19012321531772614, "logits/rejected": -0.11848640441894531, "logps/chosen": -4.485342979431152, "logps/rejected": -5.298763751983643, "loss": 0.0503, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.485342979431152, "rewards/margins": 0.8134201169013977, "rewards/rejected": -5.298763751983643, "sft_loss": 4.0772199630737305, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 0.4536463672222037, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.42413783073425293, "logits/rejected": -0.32068854570388794, "logps/chosen": -4.322854995727539, "logps/rejected": -5.041677474975586, "loss": 0.0495, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.322854995727539, "rewards/margins": 0.7188224792480469, "rewards/rejected": -5.041677474975586, "sft_loss": 4.019509315490723, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 0.42784473263060707, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.4775485396385193, "logits/rejected": -0.22211973369121552, "logps/chosen": -4.356131076812744, "logps/rejected": -5.343842506408691, "loss": 0.0497, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.356131076812744, "rewards/margins": 0.9877112507820129, "rewards/rejected": -5.343842506408691, "sft_loss": 4.108757019042969, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 0.591812952535654, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.38019734621047974, "logits/rejected": -0.32415252923965454, "logps/chosen": -4.308173179626465, "logps/rejected": -5.0770392417907715, "loss": 0.0514, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.308173179626465, "rewards/margins": 0.7688660025596619, "rewards/rejected": -5.0770392417907715, "sft_loss": 4.112570285797119, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 0.6807512245577299, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.4845009744167328, "logits/rejected": -0.31688985228538513, "logps/chosen": -4.246060371398926, "logps/rejected": -5.092269420623779, "loss": 0.0495, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.246060371398926, "rewards/margins": 0.8462090492248535, "rewards/rejected": -5.092269420623779, "sft_loss": 3.8919384479522705, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 0.47888776950849704, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.3477453589439392, "logits/rejected": -0.1463039219379425, "logps/chosen": -4.308255195617676, "logps/rejected": -5.2045207023620605, "loss": 0.0509, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.308255195617676, "rewards/margins": 0.8962651491165161, "rewards/rejected": -5.2045207023620605, "sft_loss": 4.06205415725708, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 0.7940416668853761, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.27007609605789185, "logits/rejected": -0.10254746675491333, "logps/chosen": -4.4379963874816895, "logps/rejected": -5.277739524841309, "loss": 0.0517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.4379963874816895, "rewards/margins": 0.8397432565689087, "rewards/rejected": -5.277739524841309, "sft_loss": 4.179162502288818, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 0.5044049210520891, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.41755181550979614, "logits/rejected": -0.15795069932937622, "logps/chosen": -4.296069622039795, "logps/rejected": -5.277471542358398, "loss": 0.0494, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.296069622039795, "rewards/margins": 0.981401801109314, "rewards/rejected": -5.277471542358398, "sft_loss": 4.078981876373291, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 0.5570283652118182, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.4102238118648529, "logits/rejected": -0.18241597712039948, "logps/chosen": -4.257302284240723, "logps/rejected": -5.118211269378662, "loss": 0.0512, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.257302284240723, "rewards/margins": 0.8609098196029663, "rewards/rejected": -5.118211269378662, "sft_loss": 4.112555503845215, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 0.5648628736720877, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.32026559114456177, "logits/rejected": -0.08667844533920288, "logps/chosen": -4.301587104797363, "logps/rejected": -5.258988857269287, "loss": 0.0502, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.301587104797363, "rewards/margins": 0.9574017524719238, "rewards/rejected": -5.258988857269287, "sft_loss": 4.079500675201416, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 1.2682819436152235, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.2664087414741516, "logits/rejected": -0.10435505211353302, "logps/chosen": -4.1911211013793945, "logps/rejected": -5.21523380279541, "loss": 0.0512, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.1911211013793945, "rewards/margins": 1.0241124629974365, "rewards/rejected": -5.21523380279541, "sft_loss": 3.9725563526153564, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 0.5482276671902016, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.4165882468223572, "logits/rejected": -0.23353514075279236, "logps/chosen": -4.504220008850098, "logps/rejected": -5.191248893737793, "loss": 0.0513, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.504220008850098, "rewards/margins": 0.6870293617248535, "rewards/rejected": -5.191248893737793, "sft_loss": 4.223007678985596, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 0.7151427005992033, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.41871920228004456, "logits/rejected": -0.31977134943008423, "logps/chosen": -4.269133567810059, "logps/rejected": -5.074051380157471, "loss": 0.05, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.269133567810059, "rewards/margins": 0.8049181699752808, "rewards/rejected": -5.074051380157471, "sft_loss": 4.028173923492432, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 0.5973736394357382, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.3259030282497406, "logits/rejected": -0.230748251080513, "logps/chosen": -4.2867631912231445, "logps/rejected": -4.970796585083008, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.2867631912231445, "rewards/margins": 0.6840331554412842, "rewards/rejected": -4.970796585083008, "sft_loss": 3.9719643592834473, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 0.5511602661782866, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.3292701542377472, "logits/rejected": -0.1453513205051422, "logps/chosen": -4.196750640869141, "logps/rejected": -5.174393653869629, "loss": 0.0477, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.196750640869141, "rewards/margins": 0.9776426553726196, "rewards/rejected": -5.174393653869629, "sft_loss": 3.8621106147766113, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 0.5512517052338067, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.34448251128196716, "logits/rejected": -0.22929009795188904, "logps/chosen": -4.3367204666137695, "logps/rejected": -5.269116401672363, "loss": 0.0519, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.3367204666137695, "rewards/margins": 0.9323955774307251, "rewards/rejected": -5.269116401672363, "sft_loss": 4.058518409729004, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 0.46558012602017285, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.25459569692611694, "logits/rejected": -0.14168240129947662, "logps/chosen": -4.452088356018066, "logps/rejected": -5.236243724822998, "loss": 0.0505, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.452088356018066, "rewards/margins": 0.7841559648513794, "rewards/rejected": -5.236243724822998, "sft_loss": 4.1351799964904785, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 0.6347829987966642, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.4141598343849182, "logits/rejected": -0.1650456190109253, "logps/chosen": -4.456624507904053, "logps/rejected": -5.327982425689697, "loss": 0.0493, "rewards/accuracies": 0.75, "rewards/chosen": -4.456624507904053, "rewards/margins": 0.8713573217391968, "rewards/rejected": -5.327982425689697, "sft_loss": 4.143157958984375, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 0.3941280329888256, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.28264540433883667, "logits/rejected": -0.29245123267173767, "logps/chosen": -4.255073070526123, "logps/rejected": -5.082621097564697, "loss": 0.0499, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.255073070526123, "rewards/margins": 0.827547550201416, "rewards/rejected": -5.082621097564697, "sft_loss": 4.036890983581543, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 0.4219232787295329, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.309459388256073, "logits/rejected": -0.24101197719573975, "logps/chosen": -4.4378767013549805, "logps/rejected": -5.039052486419678, "loss": 0.0523, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.4378767013549805, "rewards/margins": 0.6011752486228943, "rewards/rejected": -5.039052486419678, "sft_loss": 4.130825042724609, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 0.482405415426906, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.31963956356048584, "logits/rejected": -0.180974081158638, "logps/chosen": -4.413501739501953, "logps/rejected": -5.067274570465088, "loss": 0.0514, "rewards/accuracies": 0.65625, "rewards/chosen": -4.413501739501953, "rewards/margins": 0.6537727117538452, "rewards/rejected": -5.067274570465088, "sft_loss": 4.154252529144287, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 0.3865437479050332, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.44079723954200745, "logits/rejected": -0.24643990397453308, "logps/chosen": -4.45873498916626, "logps/rejected": -5.161642074584961, "loss": 0.052, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.45873498916626, "rewards/margins": 0.7029072046279907, "rewards/rejected": -5.161642074584961, "sft_loss": 4.244243144989014, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 0.4862165718551747, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.3659209907054901, "logits/rejected": -0.1901930421590805, "logps/chosen": -4.335224151611328, "logps/rejected": -5.234259605407715, "loss": 0.0499, "rewards/accuracies": 0.71875, "rewards/chosen": -4.335224151611328, "rewards/margins": 0.8990362286567688, "rewards/rejected": -5.234259605407715, "sft_loss": 4.113430500030518, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 0.589991768059557, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.35370510816574097, "logits/rejected": -0.28823089599609375, "logps/chosen": -4.255982398986816, "logps/rejected": -5.0965423583984375, "loss": 0.0497, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.255982398986816, "rewards/margins": 0.8405606150627136, "rewards/rejected": -5.0965423583984375, "sft_loss": 3.981487274169922, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 0.6335308159808326, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.46438127756118774, "logits/rejected": -0.2801642417907715, "logps/chosen": -4.140517234802246, "logps/rejected": -5.057478427886963, "loss": 0.05, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.140517234802246, "rewards/margins": 0.9169610142707825, "rewards/rejected": -5.057478427886963, "sft_loss": 3.9590744972229004, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 0.7515168620245619, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.3501364290714264, "logits/rejected": -0.2728483974933624, "logps/chosen": -4.320809364318848, "logps/rejected": -5.093451499938965, "loss": 0.0518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.320809364318848, "rewards/margins": 0.7726426720619202, "rewards/rejected": -5.093451499938965, "sft_loss": 4.071271896362305, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 0.6183918453023575, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.3462718427181244, "logits/rejected": -0.2552764415740967, "logps/chosen": -4.334949970245361, "logps/rejected": -5.172145366668701, "loss": 0.0507, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.334949970245361, "rewards/margins": 0.8371955752372742, "rewards/rejected": -5.172145366668701, "sft_loss": 4.093982219696045, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 0.7821774872769638, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.39049679040908813, "logits/rejected": -0.17837652564048767, "logps/chosen": -4.518766403198242, "logps/rejected": -5.061732292175293, "loss": 0.0538, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.518766403198242, "rewards/margins": 0.542966365814209, "rewards/rejected": -5.061732292175293, "sft_loss": 4.259708404541016, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 0.4422456622841627, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.2510150372982025, "logits/rejected": -0.09285564720630646, "logps/chosen": -4.417247772216797, "logps/rejected": -5.220688819885254, "loss": 0.0506, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.417247772216797, "rewards/margins": 0.8034406900405884, "rewards/rejected": -5.220688819885254, "sft_loss": 4.077648639678955, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 0.5447054778045387, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.37670284509658813, "logits/rejected": -0.33116498589515686, "logps/chosen": -4.425228118896484, "logps/rejected": -5.350491046905518, "loss": 0.0509, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.425228118896484, "rewards/margins": 0.9252630472183228, "rewards/rejected": -5.350491046905518, "sft_loss": 4.21567440032959, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 0.6839068942676009, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.332570344209671, "logits/rejected": -0.22967591881752014, "logps/chosen": -4.309033393859863, "logps/rejected": -5.366685390472412, "loss": 0.0505, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.309033393859863, "rewards/margins": 1.0576521158218384, "rewards/rejected": -5.366685390472412, "sft_loss": 4.095768928527832, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 0.539794604604235, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.3633222281932831, "logits/rejected": -0.20267243683338165, "logps/chosen": -4.284216403961182, "logps/rejected": -5.179417610168457, "loss": 0.0499, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.284216403961182, "rewards/margins": 0.8952015042304993, "rewards/rejected": -5.179417610168457, "sft_loss": 4.089019298553467, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 0.6650863893790436, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.2811715602874756, "logits/rejected": -0.19412469863891602, "logps/chosen": -4.374194622039795, "logps/rejected": -5.159496784210205, "loss": 0.05, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.374194622039795, "rewards/margins": 0.7853022217750549, "rewards/rejected": -5.159496784210205, "sft_loss": 4.077193737030029, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 0.7570690390018039, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.2788086533546448, "logits/rejected": -0.1510922908782959, "logps/chosen": -4.137418746948242, "logps/rejected": -4.947587013244629, "loss": 0.0486, "rewards/accuracies": 0.71875, "rewards/chosen": -4.137418746948242, "rewards/margins": 0.8101680874824524, "rewards/rejected": -4.947587013244629, "sft_loss": 3.85473895072937, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 0.6922445687240398, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.3353812098503113, "logits/rejected": -0.20698490738868713, "logps/chosen": -4.306082248687744, "logps/rejected": -5.15787410736084, "loss": 0.0506, "rewards/accuracies": 0.75, "rewards/chosen": -4.306082248687744, "rewards/margins": 0.8517919778823853, "rewards/rejected": -5.15787410736084, "sft_loss": 4.074376106262207, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 0.5903877756187921, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.4253089427947998, "logits/rejected": -0.28170305490493774, "logps/chosen": -4.395491123199463, "logps/rejected": -5.1986823081970215, "loss": 0.0503, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.395491123199463, "rewards/margins": 0.8031916618347168, "rewards/rejected": -5.1986823081970215, "sft_loss": 4.087505340576172, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 0.5932205535715832, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.40591302514076233, "logits/rejected": -0.23701974749565125, "logps/chosen": -4.1090288162231445, "logps/rejected": -4.979941368103027, "loss": 0.0509, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.1090288162231445, "rewards/margins": 0.8709122538566589, "rewards/rejected": -4.979941368103027, "sft_loss": 3.8675315380096436, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 0.37999548877256567, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.4235003590583801, "logits/rejected": -0.10297024250030518, "logps/chosen": -4.314941883087158, "logps/rejected": -5.224982738494873, "loss": 0.0501, "rewards/accuracies": 0.75, "rewards/chosen": -4.314941883087158, "rewards/margins": 0.9100410342216492, "rewards/rejected": -5.224982738494873, "sft_loss": 4.035270690917969, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 0.5848375242126514, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.23933526873588562, "logits/rejected": -0.15588855743408203, "logps/chosen": -4.382521629333496, "logps/rejected": -5.272342205047607, "loss": 0.0493, "rewards/accuracies": 0.71875, "rewards/chosen": -4.382521629333496, "rewards/margins": 0.88982093334198, "rewards/rejected": -5.272342205047607, "sft_loss": 4.00167179107666, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.19795073568820953, "eval_logits/rejected": 0.30290400981903076, "eval_logps/chosen": -4.395362377166748, "eval_logps/rejected": -5.153711795806885, "eval_loss": 0.04997352510690689, "eval_rewards/accuracies": 0.6839762330055237, "eval_rewards/chosen": -4.395362377166748, "eval_rewards/margins": 0.7583494782447815, "eval_rewards/rejected": -5.153711795806885, "eval_runtime": 45.0603, "eval_samples_per_second": 29.849, "eval_sft_loss": 3.9750618934631348, "eval_steps_per_second": 7.479, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 0.6938806035793248, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.2542392611503601, "logits/rejected": 0.00998584646731615, "logps/chosen": -4.379458427429199, "logps/rejected": -5.155462265014648, "loss": 0.0499, "rewards/accuracies": 0.71875, "rewards/chosen": -4.379458427429199, "rewards/margins": 0.7760039567947388, "rewards/rejected": -5.155462265014648, "sft_loss": 4.045472621917725, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 0.39509876824712153, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.3175296187400818, "logits/rejected": -0.20715491473674774, "logps/chosen": -4.360111236572266, "logps/rejected": -5.166841983795166, "loss": 0.0489, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.360111236572266, "rewards/margins": 0.8067308664321899, "rewards/rejected": -5.166841983795166, "sft_loss": 3.986266613006592, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 0.66114171413954, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.31307560205459595, "logits/rejected": -0.008983058854937553, "logps/chosen": -4.3839335441589355, "logps/rejected": -5.171870231628418, "loss": 0.0522, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.3839335441589355, "rewards/margins": 0.7879377603530884, "rewards/rejected": -5.171870231628418, "sft_loss": 4.155789375305176, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 0.3597383400103683, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.39556217193603516, "logits/rejected": -0.25675544142723083, "logps/chosen": -4.207012176513672, "logps/rejected": -5.153992652893066, "loss": 0.0488, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.207012176513672, "rewards/margins": 0.9469804763793945, "rewards/rejected": -5.153992652893066, "sft_loss": 3.9583792686462402, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 0.5681000111895875, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.2569750249385834, "logits/rejected": -0.15023109316825867, "logps/chosen": -4.433220386505127, "logps/rejected": -5.140045642852783, "loss": 0.0514, "rewards/accuracies": 0.6875, "rewards/chosen": -4.433220386505127, "rewards/margins": 0.7068256139755249, "rewards/rejected": -5.140045642852783, "sft_loss": 4.159654140472412, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 0.40353186945349734, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.3387718200683594, "logits/rejected": -0.278081476688385, "logps/chosen": -4.410946846008301, "logps/rejected": -5.245885372161865, "loss": 0.0485, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.410946846008301, "rewards/margins": 0.8349380493164062, "rewards/rejected": -5.245885372161865, "sft_loss": 3.995271682739258, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 0.38721194269692477, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.2565738260746002, "logits/rejected": -0.2947031259536743, "logps/chosen": -4.291686058044434, "logps/rejected": -4.931029796600342, "loss": 0.0505, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.291686058044434, "rewards/margins": 0.6393446922302246, "rewards/rejected": -4.931029796600342, "sft_loss": 3.965325117111206, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 0.44677508552918804, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.2852485775947571, "logits/rejected": -0.32384148240089417, "logps/chosen": -4.3653764724731445, "logps/rejected": -5.132533073425293, "loss": 0.0501, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.3653764724731445, "rewards/margins": 0.7671566009521484, "rewards/rejected": -5.132533073425293, "sft_loss": 4.074067115783691, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 0.5021427968737899, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.29809674620628357, "logits/rejected": -0.14820165932178497, "logps/chosen": -4.290149211883545, "logps/rejected": -5.1410932540893555, "loss": 0.0503, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.290149211883545, "rewards/margins": 0.8509443998336792, "rewards/rejected": -5.1410932540893555, "sft_loss": 4.025235176086426, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 0.5799504566757765, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.3963567912578583, "logits/rejected": -0.2322741001844406, "logps/chosen": -4.519765377044678, "logps/rejected": -5.176243782043457, "loss": 0.0544, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.519765377044678, "rewards/margins": 0.6564784049987793, "rewards/rejected": -5.176243782043457, "sft_loss": 4.351537704467773, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 0.8047811292467494, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.2328902930021286, "logits/rejected": -0.17223525047302246, "logps/chosen": -4.405119895935059, "logps/rejected": -4.998657703399658, "loss": 0.0513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.405119895935059, "rewards/margins": 0.593537449836731, "rewards/rejected": -4.998657703399658, "sft_loss": 4.159225940704346, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 0.5192972561919211, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.2871648371219635, "logits/rejected": -0.09363778680562973, "logps/chosen": -4.515463352203369, "logps/rejected": -5.090214729309082, "loss": 0.0532, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.515463352203369, "rewards/margins": 0.5747517347335815, "rewards/rejected": -5.090214729309082, "sft_loss": 4.21968936920166, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 0.4878248040983854, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.39410749077796936, "logits/rejected": -0.3096895217895508, "logps/chosen": -4.484684467315674, "logps/rejected": -5.331465721130371, "loss": 0.0505, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.484684467315674, "rewards/margins": 0.8467812538146973, "rewards/rejected": -5.331465721130371, "sft_loss": 4.192440986633301, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 0.514093539313008, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.314320832490921, "logits/rejected": -0.07475896179676056, "logps/chosen": -4.39267635345459, "logps/rejected": -5.261028289794922, "loss": 0.0502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.39267635345459, "rewards/margins": 0.8683518171310425, "rewards/rejected": -5.261028289794922, "sft_loss": 4.083760738372803, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 0.5910254433311389, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.33221128582954407, "logits/rejected": -0.12248317152261734, "logps/chosen": -4.258050918579102, "logps/rejected": -5.079594135284424, "loss": 0.0484, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.258050918579102, "rewards/margins": 0.8215433955192566, "rewards/rejected": -5.079594135284424, "sft_loss": 3.9350600242614746, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 0.44331518926666863, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.3698522448539734, "logits/rejected": -0.24156637489795685, "logps/chosen": -4.3765363693237305, "logps/rejected": -5.2998046875, "loss": 0.0497, "rewards/accuracies": 0.71875, "rewards/chosen": -4.3765363693237305, "rewards/margins": 0.9232684373855591, "rewards/rejected": -5.2998046875, "sft_loss": 4.07401704788208, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 0.4360409311344564, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.2376057207584381, "logits/rejected": -0.1950663924217224, "logps/chosen": -4.3534393310546875, "logps/rejected": -5.132110595703125, "loss": 0.0508, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.3534393310546875, "rewards/margins": 0.7786713242530823, "rewards/rejected": -5.132110595703125, "sft_loss": 4.0486626625061035, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 0.4611340496677084, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.42794519662857056, "logits/rejected": -0.22650738060474396, "logps/chosen": -4.329638481140137, "logps/rejected": -5.147965431213379, "loss": 0.0496, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.329638481140137, "rewards/margins": 0.8183272480964661, "rewards/rejected": -5.147965431213379, "sft_loss": 3.998382568359375, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 0.5195705902795844, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.4067610800266266, "logits/rejected": -0.32761862874031067, "logps/chosen": -4.3563666343688965, "logps/rejected": -5.1253862380981445, "loss": 0.0497, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.3563666343688965, "rewards/margins": 0.769019603729248, "rewards/rejected": -5.1253862380981445, "sft_loss": 4.0309929847717285, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 0.7515484520612266, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.2933884561061859, "logits/rejected": -0.10745501518249512, "logps/chosen": -4.388808250427246, "logps/rejected": -5.196818828582764, "loss": 0.05, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.388808250427246, "rewards/margins": 0.8080108761787415, "rewards/rejected": -5.196818828582764, "sft_loss": 4.126204490661621, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 0.36793126931525066, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.3496021330356598, "logits/rejected": -0.12837150692939758, "logps/chosen": -4.346577167510986, "logps/rejected": -5.146768569946289, "loss": 0.0498, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.346577167510986, "rewards/margins": 0.8001911044120789, "rewards/rejected": -5.146768569946289, "sft_loss": 4.027698040008545, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 0.5183161146135171, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.31241917610168457, "logits/rejected": -0.09111049026250839, "logps/chosen": -4.2632646560668945, "logps/rejected": -5.173994064331055, "loss": 0.0496, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.2632646560668945, "rewards/margins": 0.910729706287384, "rewards/rejected": -5.173994064331055, "sft_loss": 3.9972128868103027, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 0.960953323451343, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.2670236825942993, "logits/rejected": -0.18157057464122772, "logps/chosen": -4.247032642364502, "logps/rejected": -5.117427825927734, "loss": 0.05, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.247032642364502, "rewards/margins": 0.8703948855400085, "rewards/rejected": -5.117427825927734, "sft_loss": 3.9730453491210938, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 0.9825229794639762, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.32241255044937134, "logits/rejected": -0.21362285315990448, "logps/chosen": -4.210694313049316, "logps/rejected": -4.990067481994629, "loss": 0.0508, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.210694313049316, "rewards/margins": 0.779373824596405, "rewards/rejected": -4.990067481994629, "sft_loss": 3.9055423736572266, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 0.487116120875464, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.3949047327041626, "logits/rejected": -0.17741958796977997, "logps/chosen": -4.219511032104492, "logps/rejected": -5.2257795333862305, "loss": 0.049, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.219511032104492, "rewards/margins": 1.0062682628631592, "rewards/rejected": -5.2257795333862305, "sft_loss": 3.941300630569458, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 0.4971791224235736, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.26797419786453247, "logits/rejected": -0.17251023650169373, "logps/chosen": -4.320847034454346, "logps/rejected": -5.197128772735596, "loss": 0.0508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.320847034454346, "rewards/margins": 0.8762819170951843, "rewards/rejected": -5.197128772735596, "sft_loss": 4.089552879333496, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 0.6298909129841176, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.20016708970069885, "logits/rejected": -0.24042925238609314, "logps/chosen": -4.610543727874756, "logps/rejected": -5.33798885345459, "loss": 0.0527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.610543727874756, "rewards/margins": 0.727445125579834, "rewards/rejected": -5.33798885345459, "sft_loss": 4.340829372406006, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 0.433532923165645, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.4819692075252533, "logits/rejected": -0.42997080087661743, "logps/chosen": -4.405976295471191, "logps/rejected": -5.104135990142822, "loss": 0.0522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.405976295471191, "rewards/margins": 0.6981590986251831, "rewards/rejected": -5.104135990142822, "sft_loss": 4.199334144592285, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 0.5595706664938976, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.23996420204639435, "logits/rejected": -0.0859023854136467, "logps/chosen": -4.4849348068237305, "logps/rejected": -5.22530460357666, "loss": 0.0511, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.4849348068237305, "rewards/margins": 0.7403702735900879, "rewards/rejected": -5.22530460357666, "sft_loss": 4.229413032531738, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 0.47235470682786457, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.36756059527397156, "logits/rejected": -0.25558412075042725, "logps/chosen": -4.3525214195251465, "logps/rejected": -5.156813621520996, "loss": 0.0504, "rewards/accuracies": 0.71875, "rewards/chosen": -4.3525214195251465, "rewards/margins": 0.8042919039726257, "rewards/rejected": -5.156813621520996, "sft_loss": 4.11196231842041, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 0.5165025523535854, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.3705408275127411, "logits/rejected": -0.38917452096939087, "logps/chosen": -4.170284271240234, "logps/rejected": -4.961236000061035, "loss": 0.0493, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.170284271240234, "rewards/margins": 0.790952205657959, "rewards/rejected": -4.961236000061035, "sft_loss": 3.8401451110839844, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 0.6027686935337975, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.3659024238586426, "logits/rejected": -0.23701664805412292, "logps/chosen": -4.248166561126709, "logps/rejected": -5.038237571716309, "loss": 0.0504, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.248166561126709, "rewards/margins": 0.7900711894035339, "rewards/rejected": -5.038237571716309, "sft_loss": 3.9721550941467285, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 0.7255411038290942, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.35837799310684204, "logits/rejected": -0.22272975742816925, "logps/chosen": -4.344782829284668, "logps/rejected": -5.208219051361084, "loss": 0.0506, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.344782829284668, "rewards/margins": 0.8634363412857056, "rewards/rejected": -5.208219051361084, "sft_loss": 4.040823936462402, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 0.6179482287804432, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.17336727678775787, "logits/rejected": -0.10471125692129135, "logps/chosen": -4.413411617279053, "logps/rejected": -5.257023334503174, "loss": 0.051, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.413411617279053, "rewards/margins": 0.8436113595962524, "rewards/rejected": -5.257023334503174, "sft_loss": 4.22342586517334, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 0.9128972719000168, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.32853183150291443, "logits/rejected": -0.23889176547527313, "logps/chosen": -4.317111015319824, "logps/rejected": -5.020560264587402, "loss": 0.0511, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.317111015319824, "rewards/margins": 0.703448474407196, "rewards/rejected": -5.020560264587402, "sft_loss": 4.063889503479004, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 0.570277555661351, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.37603339552879333, "logits/rejected": -0.12415879964828491, "logps/chosen": -4.372494697570801, "logps/rejected": -5.246346950531006, "loss": 0.051, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.372494697570801, "rewards/margins": 0.8738527297973633, "rewards/rejected": -5.246346950531006, "sft_loss": 4.095280647277832, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 0.5063146939111651, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.33638912439346313, "logits/rejected": -0.32763671875, "logps/chosen": -4.277950286865234, "logps/rejected": -5.035035133361816, "loss": 0.049, "rewards/accuracies": 0.71875, "rewards/chosen": -4.277950286865234, "rewards/margins": 0.757083535194397, "rewards/rejected": -5.035035133361816, "sft_loss": 3.931530714035034, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 0.5668311239347966, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.17369453608989716, "logits/rejected": -0.08982907980680466, "logps/chosen": -4.383388519287109, "logps/rejected": -5.2250165939331055, "loss": 0.0517, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.383388519287109, "rewards/margins": 0.8416286706924438, "rewards/rejected": -5.2250165939331055, "sft_loss": 4.108694553375244, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 0.4524352019006803, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.462289422750473, "logits/rejected": -0.32472777366638184, "logps/chosen": -4.465978145599365, "logps/rejected": -5.1302995681762695, "loss": 0.0505, "rewards/accuracies": 0.6875, "rewards/chosen": -4.465978145599365, "rewards/margins": 0.6643209457397461, "rewards/rejected": -5.1302995681762695, "sft_loss": 4.117940902709961, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 0.7152486203815478, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.17068356275558472, "logits/rejected": -0.21633978188037872, "logps/chosen": -4.524289131164551, "logps/rejected": -5.111501216888428, "loss": 0.0535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.524289131164551, "rewards/margins": 0.5872117877006531, "rewards/rejected": -5.111501216888428, "sft_loss": 4.23223352432251, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 0.4968827477148222, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.39958643913269043, "logits/rejected": -0.3283675014972687, "logps/chosen": -4.347672462463379, "logps/rejected": -5.024527072906494, "loss": 0.0508, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.347672462463379, "rewards/margins": 0.6768544912338257, "rewards/rejected": -5.024527072906494, "sft_loss": 3.984272003173828, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 0.6034610216258647, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.35456377267837524, "logits/rejected": -0.23129554092884064, "logps/chosen": -4.35485315322876, "logps/rejected": -5.382556438446045, "loss": 0.0501, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.35485315322876, "rewards/margins": 1.0277034044265747, "rewards/rejected": -5.382556438446045, "sft_loss": 4.1364922523498535, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 0.7072110788697915, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.34522002935409546, "logits/rejected": -0.18638448417186737, "logps/chosen": -4.422163963317871, "logps/rejected": -5.283125400543213, "loss": 0.0513, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.422163963317871, "rewards/margins": 0.8609609603881836, "rewards/rejected": -5.283125400543213, "sft_loss": 4.139298915863037, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 0.45836262120476573, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.41230830550193787, "logits/rejected": -0.20210537314414978, "logps/chosen": -4.222951889038086, "logps/rejected": -5.2063188552856445, "loss": 0.0487, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.222951889038086, "rewards/margins": 0.9833674430847168, "rewards/rejected": -5.2063188552856445, "sft_loss": 4.025243282318115, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 0.3997972564343064, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.41125649213790894, "logits/rejected": -0.12870237231254578, "logps/chosen": -4.29024600982666, "logps/rejected": -5.098462104797363, "loss": 0.0514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.29024600982666, "rewards/margins": 0.8082154393196106, "rewards/rejected": -5.098462104797363, "sft_loss": 4.028970718383789, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 0.4035893113625815, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.3744090795516968, "logits/rejected": -0.30211079120635986, "logps/chosen": -4.280365467071533, "logps/rejected": -5.217028617858887, "loss": 0.0488, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.280365467071533, "rewards/margins": 0.9366633296012878, "rewards/rejected": -5.217028617858887, "sft_loss": 3.9095451831817627, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 0.6207283136955817, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.34626153111457825, "logits/rejected": -0.2757788300514221, "logps/chosen": -4.46042537689209, "logps/rejected": -5.151759147644043, "loss": 0.0504, "rewards/accuracies": 0.6875, "rewards/chosen": -4.46042537689209, "rewards/margins": 0.691334068775177, "rewards/rejected": -5.151759147644043, "sft_loss": 4.131369590759277, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 0.48534289801784386, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.3493006229400635, "logits/rejected": -0.25627556443214417, "logps/chosen": -4.3235392570495605, "logps/rejected": -5.203847885131836, "loss": 0.0491, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.3235392570495605, "rewards/margins": 0.8803087472915649, "rewards/rejected": -5.203847885131836, "sft_loss": 4.0077996253967285, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 0.4853526836957504, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.3779717683792114, "logits/rejected": -0.22099003195762634, "logps/chosen": -4.265778541564941, "logps/rejected": -5.1664605140686035, "loss": 0.0493, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.265778541564941, "rewards/margins": 0.90068119764328, "rewards/rejected": -5.1664605140686035, "sft_loss": 3.9554412364959717, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 0.37946669743407174, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.37565523386001587, "logits/rejected": -0.16238412261009216, "logps/chosen": -4.434185981750488, "logps/rejected": -5.394456386566162, "loss": 0.0486, "rewards/accuracies": 0.78125, "rewards/chosen": -4.434185981750488, "rewards/margins": 0.9602702856063843, "rewards/rejected": -5.394456386566162, "sft_loss": 4.023019313812256, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 0.5047515792520335, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.40330666303634644, "logits/rejected": -0.1070922389626503, "logps/chosen": -4.3636579513549805, "logps/rejected": -5.487493991851807, "loss": 0.049, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.3636579513549805, "rewards/margins": 1.1238361597061157, "rewards/rejected": -5.487493991851807, "sft_loss": 3.9812254905700684, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 0.49986291771664965, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.3085281252861023, "logits/rejected": -0.23384499549865723, "logps/chosen": -4.470911979675293, "logps/rejected": -5.105937957763672, "loss": 0.0522, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.470911979675293, "rewards/margins": 0.6350253820419312, "rewards/rejected": -5.105937957763672, "sft_loss": 4.192760467529297, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 0.419551409827489, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.45216822624206543, "logits/rejected": -0.2786790728569031, "logps/chosen": -4.20875883102417, "logps/rejected": -5.2486252784729, "loss": 0.0478, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.20875883102417, "rewards/margins": 1.03986656665802, "rewards/rejected": -5.2486252784729, "sft_loss": 3.9220237731933594, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 0.672454413679323, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.3137684464454651, "logits/rejected": -0.031981952488422394, "logps/chosen": -4.239866256713867, "logps/rejected": -5.073857307434082, "loss": 0.0506, "rewards/accuracies": 0.65625, "rewards/chosen": -4.239866256713867, "rewards/margins": 0.8339906930923462, "rewards/rejected": -5.073857307434082, "sft_loss": 3.968554973602295, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 0.47486382919668607, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.34231531620025635, "logits/rejected": -0.23783119022846222, "logps/chosen": -4.323024749755859, "logps/rejected": -5.134915828704834, "loss": 0.0506, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.323024749755859, "rewards/margins": 0.8118915557861328, "rewards/rejected": -5.134915828704834, "sft_loss": 4.072056770324707, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 0.8756257355383923, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.41580209136009216, "logits/rejected": -0.3289106488227844, "logps/chosen": -4.2255096435546875, "logps/rejected": -5.234991550445557, "loss": 0.0488, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.2255096435546875, "rewards/margins": 1.009481430053711, "rewards/rejected": -5.234991550445557, "sft_loss": 3.9910926818847656, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 0.5727277350002951, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.341047465801239, "logits/rejected": -0.3902260363101959, "logps/chosen": -4.409988880157471, "logps/rejected": -5.2129974365234375, "loss": 0.0505, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.409988880157471, "rewards/margins": 0.8030092120170593, "rewards/rejected": -5.2129974365234375, "sft_loss": 4.154358863830566, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 0.5076883038357715, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.4511869549751282, "logits/rejected": -0.26104557514190674, "logps/chosen": -4.3491363525390625, "logps/rejected": -5.162402153015137, "loss": 0.05, "rewards/accuracies": 0.78125, "rewards/chosen": -4.3491363525390625, "rewards/margins": 0.8132661581039429, "rewards/rejected": -5.162402153015137, "sft_loss": 4.109154224395752, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 0.4023153185094249, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.43699079751968384, "logits/rejected": -0.3607856035232544, "logps/chosen": -4.445807456970215, "logps/rejected": -5.105084419250488, "loss": 0.0518, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.445807456970215, "rewards/margins": 0.6592772603034973, "rewards/rejected": -5.105084419250488, "sft_loss": 4.227627754211426, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 0.47451049109887844, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.33742186427116394, "logits/rejected": -0.22219757735729218, "logps/chosen": -4.269178867340088, "logps/rejected": -4.994236469268799, "loss": 0.0499, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.269178867340088, "rewards/margins": 0.7250576019287109, "rewards/rejected": -4.994236469268799, "sft_loss": 3.927827835083008, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 0.537249755204217, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.4085536003112793, "logits/rejected": -0.2702638506889343, "logps/chosen": -4.376568794250488, "logps/rejected": -5.194339752197266, "loss": 0.0506, "rewards/accuracies": 0.6875, "rewards/chosen": -4.376568794250488, "rewards/margins": 0.817771315574646, "rewards/rejected": -5.194339752197266, "sft_loss": 4.102890968322754, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 0.4232606940574936, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.42444998025894165, "logits/rejected": -0.20567944645881653, "logps/chosen": -4.219876289367676, "logps/rejected": -5.182650566101074, "loss": 0.0495, "rewards/accuracies": 0.75, "rewards/chosen": -4.219876289367676, "rewards/margins": 0.9627736210823059, "rewards/rejected": -5.182650566101074, "sft_loss": 3.9575088024139404, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 0.6045181843568596, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.42669230699539185, "logits/rejected": -0.246909499168396, "logps/chosen": -4.394600868225098, "logps/rejected": -5.036022186279297, "loss": 0.0522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.394600868225098, "rewards/margins": 0.6414215564727783, "rewards/rejected": -5.036022186279297, "sft_loss": 4.132719039916992, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 0.5688567714557625, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.4220002591609955, "logits/rejected": -0.2799118161201477, "logps/chosen": -4.280217170715332, "logps/rejected": -5.047442436218262, "loss": 0.0518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.280217170715332, "rewards/margins": 0.7672249674797058, "rewards/rejected": -5.047442436218262, "sft_loss": 3.9711861610412598, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 0.6394676393882484, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.4407338500022888, "logits/rejected": -0.241370290517807, "logps/chosen": -4.418463706970215, "logps/rejected": -5.3320698738098145, "loss": 0.0513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.418463706970215, "rewards/margins": 0.9136059880256653, "rewards/rejected": -5.3320698738098145, "sft_loss": 4.146149635314941, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 0.47999784311707305, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.4439243674278259, "logits/rejected": -0.3043748438358307, "logps/chosen": -4.377346992492676, "logps/rejected": -5.098249912261963, "loss": 0.0505, "rewards/accuracies": 0.71875, "rewards/chosen": -4.377346992492676, "rewards/margins": 0.7209030389785767, "rewards/rejected": -5.098249912261963, "sft_loss": 4.071615695953369, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 0.3861928086843125, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.4043838083744049, "logits/rejected": -0.31296491622924805, "logps/chosen": -4.497230529785156, "logps/rejected": -5.2701849937438965, "loss": 0.0503, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.497230529785156, "rewards/margins": 0.7729544639587402, "rewards/rejected": -5.2701849937438965, "sft_loss": 4.045097827911377, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 0.51884864533075, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.4912940561771393, "logits/rejected": -0.2375914603471756, "logps/chosen": -4.4356689453125, "logps/rejected": -5.331031799316406, "loss": 0.0503, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.4356689453125, "rewards/margins": 0.8953633308410645, "rewards/rejected": -5.331031799316406, "sft_loss": 4.134769916534424, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 0.6199903820759686, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.4292561113834381, "logits/rejected": -0.3562420606613159, "logps/chosen": -4.502501010894775, "logps/rejected": -5.053229331970215, "loss": 0.0516, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.502501010894775, "rewards/margins": 0.5507287383079529, "rewards/rejected": -5.053229331970215, "sft_loss": 4.21864652633667, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 0.3695485210076071, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.4103737473487854, "logits/rejected": -0.3381521999835968, "logps/chosen": -4.395519256591797, "logps/rejected": -5.27461576461792, "loss": 0.0499, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.395519256591797, "rewards/margins": 0.8790962100028992, "rewards/rejected": -5.27461576461792, "sft_loss": 4.1917924880981445, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 0.4610615645602974, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.5034217238426208, "logits/rejected": -0.34082144498825073, "logps/chosen": -4.428948402404785, "logps/rejected": -5.152653694152832, "loss": 0.0513, "rewards/accuracies": 0.71875, "rewards/chosen": -4.428948402404785, "rewards/margins": 0.723704993724823, "rewards/rejected": -5.152653694152832, "sft_loss": 4.1761956214904785, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 0.6469321968896236, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.45382094383239746, "logits/rejected": -0.18945330381393433, "logps/chosen": -4.2880859375, "logps/rejected": -5.107758045196533, "loss": 0.0496, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.2880859375, "rewards/margins": 0.819671630859375, "rewards/rejected": -5.107758045196533, "sft_loss": 3.9440059661865234, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 0.46564406778831696, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.370443195104599, "logits/rejected": -0.3640519678592682, "logps/chosen": -4.384427547454834, "logps/rejected": -5.047237396240234, "loss": 0.0531, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.384427547454834, "rewards/margins": 0.6628104448318481, "rewards/rejected": -5.047237396240234, "sft_loss": 4.172974109649658, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 0.501041684548939, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.3562886416912079, "logits/rejected": -0.22665898501873016, "logps/chosen": -4.420121669769287, "logps/rejected": -5.273455619812012, "loss": 0.0506, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.420121669769287, "rewards/margins": 0.8533342480659485, "rewards/rejected": -5.273455619812012, "sft_loss": 4.2023725509643555, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 0.44752436228931725, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.507021963596344, "logits/rejected": -0.23198696970939636, "logps/chosen": -4.363643169403076, "logps/rejected": -5.239960670471191, "loss": 0.0511, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.363643169403076, "rewards/margins": 0.8763176798820496, "rewards/rejected": -5.239960670471191, "sft_loss": 4.179306983947754, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 0.4764172344332767, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.46979475021362305, "logits/rejected": -0.2243788242340088, "logps/chosen": -4.079625129699707, "logps/rejected": -4.999655723571777, "loss": 0.0494, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.079625129699707, "rewards/margins": 0.9200307726860046, "rewards/rejected": -4.999655723571777, "sft_loss": 3.936373233795166, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 0.6228198121848917, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.49106574058532715, "logits/rejected": -0.3500491976737976, "logps/chosen": -4.392767429351807, "logps/rejected": -5.033437728881836, "loss": 0.0509, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.392767429351807, "rewards/margins": 0.6406702995300293, "rewards/rejected": -5.033437728881836, "sft_loss": 4.037944316864014, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 0.4087330269846568, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.4462736248970032, "logits/rejected": -0.3225329518318176, "logps/chosen": -4.157266139984131, "logps/rejected": -5.061835289001465, "loss": 0.0495, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.157266139984131, "rewards/margins": 0.9045697450637817, "rewards/rejected": -5.061835289001465, "sft_loss": 3.9843857288360596, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 0.8124937399329678, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.38373181223869324, "logits/rejected": -0.19799697399139404, "logps/chosen": -4.203919887542725, "logps/rejected": -5.222175598144531, "loss": 0.0502, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.203919887542725, "rewards/margins": 1.0182548761367798, "rewards/rejected": -5.222175598144531, "sft_loss": 3.9921231269836426, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 0.42055836896384047, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.43498653173446655, "logits/rejected": -0.27146443724632263, "logps/chosen": -4.396726131439209, "logps/rejected": -5.123176574707031, "loss": 0.0522, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.396726131439209, "rewards/margins": 0.7264498472213745, "rewards/rejected": -5.123176574707031, "sft_loss": 4.128332614898682, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.11306402832269669, "eval_logits/rejected": 0.2139400690793991, "eval_logps/chosen": -4.401327133178711, "eval_logps/rejected": -5.163181781768799, "eval_loss": 0.049954961985349655, "eval_rewards/accuracies": 0.6869435906410217, "eval_rewards/chosen": -4.401327133178711, "eval_rewards/margins": 0.7618544697761536, "eval_rewards/rejected": -5.163181781768799, "eval_runtime": 44.8695, "eval_samples_per_second": 29.976, "eval_sft_loss": 3.981999635696411, "eval_steps_per_second": 7.511, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 0.5317123135345402, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.4011858105659485, "logits/rejected": -0.4122949242591858, "logps/chosen": -4.417490005493164, "logps/rejected": -5.078069686889648, "loss": 0.0514, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.417490005493164, "rewards/margins": 0.6605796813964844, "rewards/rejected": -5.078069686889648, "sft_loss": 4.199349403381348, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 0.4186884267154328, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.34007394313812256, "logits/rejected": -0.25113579630851746, "logps/chosen": -4.509642124176025, "logps/rejected": -5.317172050476074, "loss": 0.0512, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.509642124176025, "rewards/margins": 0.8075307607650757, "rewards/rejected": -5.317172050476074, "sft_loss": 4.248913764953613, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 0.6066617953485327, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.394034206867218, "logits/rejected": -0.20599737763404846, "logps/chosen": -4.279552459716797, "logps/rejected": -5.119963645935059, "loss": 0.0503, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.279552459716797, "rewards/margins": 0.8404117822647095, "rewards/rejected": -5.119963645935059, "sft_loss": 3.9960410594940186, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 0.43807345354435145, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.392520010471344, "logits/rejected": -0.2974608838558197, "logps/chosen": -4.408247470855713, "logps/rejected": -5.05702543258667, "loss": 0.0511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.408247470855713, "rewards/margins": 0.6487780809402466, "rewards/rejected": -5.05702543258667, "sft_loss": 4.111867427825928, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 0.3919286648470138, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.3907695710659027, "logits/rejected": -0.32728058099746704, "logps/chosen": -4.114865303039551, "logps/rejected": -4.888617515563965, "loss": 0.0516, "rewards/accuracies": 0.71875, "rewards/chosen": -4.114865303039551, "rewards/margins": 0.7737522125244141, "rewards/rejected": -4.888617515563965, "sft_loss": 3.919823408126831, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 0.42910296321292596, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.4506562352180481, "logits/rejected": -0.2691783607006073, "logps/chosen": -4.15285587310791, "logps/rejected": -5.211766719818115, "loss": 0.0474, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.15285587310791, "rewards/margins": 1.0589115619659424, "rewards/rejected": -5.211766719818115, "sft_loss": 3.842996120452881, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 0.5039120295826071, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.42511478066444397, "logits/rejected": -0.2042577713727951, "logps/chosen": -4.339873790740967, "logps/rejected": -5.093599319458008, "loss": 0.0515, "rewards/accuracies": 0.71875, "rewards/chosen": -4.339873790740967, "rewards/margins": 0.7537254095077515, "rewards/rejected": -5.093599319458008, "sft_loss": 4.0721588134765625, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 0.4743203519633735, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.5358638763427734, "logits/rejected": -0.219042107462883, "logps/chosen": -4.194911956787109, "logps/rejected": -5.071628570556641, "loss": 0.0496, "rewards/accuracies": 0.8125, "rewards/chosen": -4.194911956787109, "rewards/margins": 0.8767167925834656, "rewards/rejected": -5.071628570556641, "sft_loss": 3.974562168121338, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 0.5337490751429477, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.43740472197532654, "logits/rejected": -0.1700814664363861, "logps/chosen": -4.525032043457031, "logps/rejected": -5.163545608520508, "loss": 0.0503, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.525032043457031, "rewards/margins": 0.6385140419006348, "rewards/rejected": -5.163545608520508, "sft_loss": 4.158745765686035, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 0.6497343103889583, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.3816669285297394, "logits/rejected": -0.23741519451141357, "logps/chosen": -4.496787071228027, "logps/rejected": -5.277434349060059, "loss": 0.0508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.496787071228027, "rewards/margins": 0.7806466221809387, "rewards/rejected": -5.277434349060059, "sft_loss": 4.196100234985352, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 0.5955899070785665, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.4977712631225586, "logits/rejected": -0.3810668885707855, "logps/chosen": -4.355957508087158, "logps/rejected": -5.177579879760742, "loss": 0.05, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.355957508087158, "rewards/margins": 0.8216217756271362, "rewards/rejected": -5.177579879760742, "sft_loss": 4.06763219833374, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 0.7554544600132199, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.5054645538330078, "logits/rejected": -0.2624374032020569, "logps/chosen": -4.189089298248291, "logps/rejected": -5.132569789886475, "loss": 0.0507, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.189089298248291, "rewards/margins": 0.9434806108474731, "rewards/rejected": -5.132569789886475, "sft_loss": 3.9219775199890137, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 0.4472598222234297, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.45849332213401794, "logits/rejected": -0.2598671615123749, "logps/chosen": -4.236002445220947, "logps/rejected": -5.057923316955566, "loss": 0.0493, "rewards/accuracies": 0.71875, "rewards/chosen": -4.236002445220947, "rewards/margins": 0.8219209909439087, "rewards/rejected": -5.057923316955566, "sft_loss": 3.9211764335632324, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 0.5938477095529445, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.3556726574897766, "logits/rejected": -0.27085644006729126, "logps/chosen": -4.353777885437012, "logps/rejected": -5.105501651763916, "loss": 0.0513, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.353777885437012, "rewards/margins": 0.751723051071167, "rewards/rejected": -5.105501651763916, "sft_loss": 4.091824531555176, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 0.46956475804542436, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.4694312512874603, "logits/rejected": -0.23797054588794708, "logps/chosen": -4.494842529296875, "logps/rejected": -5.201089382171631, "loss": 0.0504, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.494842529296875, "rewards/margins": 0.7062473297119141, "rewards/rejected": -5.201089382171631, "sft_loss": 4.138050079345703, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 0.46317836970348397, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.5320299863815308, "logits/rejected": -0.3951790928840637, "logps/chosen": -4.233944416046143, "logps/rejected": -5.062338829040527, "loss": 0.0491, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.233944416046143, "rewards/margins": 0.8283944129943848, "rewards/rejected": -5.062338829040527, "sft_loss": 3.9799435138702393, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 0.500716513352662, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.36238226294517517, "logits/rejected": -0.28217214345932007, "logps/chosen": -4.403859615325928, "logps/rejected": -5.291438102722168, "loss": 0.0505, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.403859615325928, "rewards/margins": 0.8875784873962402, "rewards/rejected": -5.291438102722168, "sft_loss": 4.151859760284424, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 0.4413271451808516, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.46627339720726013, "logits/rejected": -0.21761274337768555, "logps/chosen": -4.465592384338379, "logps/rejected": -5.2193803787231445, "loss": 0.0517, "rewards/accuracies": 0.71875, "rewards/chosen": -4.465592384338379, "rewards/margins": 0.7537881135940552, "rewards/rejected": -5.2193803787231445, "sft_loss": 4.165318965911865, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 0.5698078550422322, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.5476104021072388, "logits/rejected": -0.30981510877609253, "logps/chosen": -4.382649898529053, "logps/rejected": -5.232707500457764, "loss": 0.0497, "rewards/accuracies": 0.71875, "rewards/chosen": -4.382649898529053, "rewards/margins": 0.8500572443008423, "rewards/rejected": -5.232707500457764, "sft_loss": 4.069231033325195, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 0.7446967723465798, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.36129865050315857, "logits/rejected": -0.34973570704460144, "logps/chosen": -4.401464939117432, "logps/rejected": -5.0461273193359375, "loss": 0.051, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.401464939117432, "rewards/margins": 0.6446620225906372, "rewards/rejected": -5.0461273193359375, "sft_loss": 4.061834812164307, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 0.44889912577423824, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.4160916805267334, "logits/rejected": -0.18251463770866394, "logps/chosen": -4.292811393737793, "logps/rejected": -5.121218681335449, "loss": 0.0511, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.292811393737793, "rewards/margins": 0.8284076452255249, "rewards/rejected": -5.121218681335449, "sft_loss": 3.972756862640381, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 0.5480919233036169, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.4768191874027252, "logits/rejected": -0.3089585602283478, "logps/chosen": -4.455610752105713, "logps/rejected": -5.065629005432129, "loss": 0.0522, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.455610752105713, "rewards/margins": 0.6100180745124817, "rewards/rejected": -5.065629005432129, "sft_loss": 4.14192533493042, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 0.6830158913780228, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.4352652132511139, "logits/rejected": -0.38772186636924744, "logps/chosen": -4.220919609069824, "logps/rejected": -5.060526371002197, "loss": 0.0481, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.220919609069824, "rewards/margins": 0.8396071195602417, "rewards/rejected": -5.060526371002197, "sft_loss": 3.8675715923309326, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 0.5852249757949145, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.45020991563796997, "logits/rejected": -0.32614919543266296, "logps/chosen": -4.4303412437438965, "logps/rejected": -5.190096378326416, "loss": 0.0503, "rewards/accuracies": 0.71875, "rewards/chosen": -4.4303412437438965, "rewards/margins": 0.7597540616989136, "rewards/rejected": -5.190096378326416, "sft_loss": 4.109742164611816, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 0.4980895649611331, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.4938434660434723, "logits/rejected": -0.2938065826892853, "logps/chosen": -4.283869743347168, "logps/rejected": -5.195856094360352, "loss": 0.0492, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.283869743347168, "rewards/margins": 0.9119867086410522, "rewards/rejected": -5.195856094360352, "sft_loss": 4.028626441955566, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 0.9526004589440363, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.5190502405166626, "logits/rejected": -0.43536219000816345, "logps/chosen": -4.259387493133545, "logps/rejected": -5.101330280303955, "loss": 0.0513, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.259387493133545, "rewards/margins": 0.8419429659843445, "rewards/rejected": -5.101330280303955, "sft_loss": 4.020884037017822, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 0.595068276479099, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.36445021629333496, "logits/rejected": -0.2759942412376404, "logps/chosen": -4.400937080383301, "logps/rejected": -5.269299507141113, "loss": 0.0515, "rewards/accuracies": 0.6875, "rewards/chosen": -4.400937080383301, "rewards/margins": 0.8683616518974304, "rewards/rejected": -5.269299507141113, "sft_loss": 4.176723957061768, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 0.45636661581557164, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.4203532636165619, "logits/rejected": -0.3165586590766907, "logps/chosen": -4.449545383453369, "logps/rejected": -5.1142096519470215, "loss": 0.0524, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.449545383453369, "rewards/margins": 0.6646645069122314, "rewards/rejected": -5.1142096519470215, "sft_loss": 4.189427375793457, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 0.37891154011258427, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.3171940743923187, "logits/rejected": -0.21769234538078308, "logps/chosen": -4.44630241394043, "logps/rejected": -5.259999752044678, "loss": 0.0498, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.44630241394043, "rewards/margins": 0.8136976957321167, "rewards/rejected": -5.259999752044678, "sft_loss": 4.108068943023682, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 0.6043546626443211, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.4760221838951111, "logits/rejected": -0.37830227613449097, "logps/chosen": -4.349831581115723, "logps/rejected": -5.125363826751709, "loss": 0.0512, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.349831581115723, "rewards/margins": 0.7755329012870789, "rewards/rejected": -5.125363826751709, "sft_loss": 4.081582069396973, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 0.8255678355187496, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.41388097405433655, "logits/rejected": -0.33308395743370056, "logps/chosen": -4.39818811416626, "logps/rejected": -4.987654209136963, "loss": 0.0524, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.39818811416626, "rewards/margins": 0.5894662141799927, "rewards/rejected": -4.987654209136963, "sft_loss": 4.104317665100098, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 0.5862689384145306, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.49319133162498474, "logits/rejected": -0.22184018790721893, "logps/chosen": -4.365403175354004, "logps/rejected": -5.162243366241455, "loss": 0.0513, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.365403175354004, "rewards/margins": 0.7968395352363586, "rewards/rejected": -5.162243366241455, "sft_loss": 4.030394554138184, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 0.5586358557159208, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.41878741979599, "logits/rejected": -0.3136092722415924, "logps/chosen": -4.293999671936035, "logps/rejected": -5.158097743988037, "loss": 0.049, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.293999671936035, "rewards/margins": 0.8640983700752258, "rewards/rejected": -5.158097743988037, "sft_loss": 3.9552371501922607, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 0.6768516428300098, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.4009891152381897, "logits/rejected": -0.22887440025806427, "logps/chosen": -4.390179634094238, "logps/rejected": -5.233609199523926, "loss": 0.0508, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.390179634094238, "rewards/margins": 0.8434289693832397, "rewards/rejected": -5.233609199523926, "sft_loss": 4.147332191467285, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 0.39904184672015913, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.33844825625419617, "logits/rejected": -0.28487923741340637, "logps/chosen": -4.381079196929932, "logps/rejected": -5.2011590003967285, "loss": 0.0511, "rewards/accuracies": 0.65625, "rewards/chosen": -4.381079196929932, "rewards/margins": 0.8200796246528625, "rewards/rejected": -5.2011590003967285, "sft_loss": 4.086765766143799, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 0.6386156784622389, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.3708297312259674, "logits/rejected": -0.315926194190979, "logps/chosen": -4.607503414154053, "logps/rejected": -5.189393520355225, "loss": 0.0515, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.607503414154053, "rewards/margins": 0.5818904638290405, "rewards/rejected": -5.189393520355225, "sft_loss": 4.188039302825928, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 0.7690041418422596, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.3973682224750519, "logits/rejected": -0.2335738241672516, "logps/chosen": -4.327630519866943, "logps/rejected": -5.160770416259766, "loss": 0.05, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.327630519866943, "rewards/margins": 0.8331397771835327, "rewards/rejected": -5.160770416259766, "sft_loss": 4.095370769500732, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 0.7700375694598965, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.4294312596321106, "logits/rejected": -0.28892675042152405, "logps/chosen": -4.411342620849609, "logps/rejected": -5.047314167022705, "loss": 0.0523, "rewards/accuracies": 0.65625, "rewards/chosen": -4.411342620849609, "rewards/margins": 0.6359715461730957, "rewards/rejected": -5.047314167022705, "sft_loss": 4.160157203674316, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 0.40676704281363285, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.4509078562259674, "logits/rejected": -0.3071804940700531, "logps/chosen": -4.243309497833252, "logps/rejected": -5.143277168273926, "loss": 0.0508, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.243309497833252, "rewards/margins": 0.899968147277832, "rewards/rejected": -5.143277168273926, "sft_loss": 3.960758924484253, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 0.598721559427233, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.4274400770664215, "logits/rejected": -0.17729714512825012, "logps/chosen": -4.204309940338135, "logps/rejected": -5.192355155944824, "loss": 0.0494, "rewards/accuracies": 0.78125, "rewards/chosen": -4.204309940338135, "rewards/margins": 0.9880453944206238, "rewards/rejected": -5.192355155944824, "sft_loss": 3.941906690597534, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 0.7290877157371126, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.42297524213790894, "logits/rejected": -0.22855360805988312, "logps/chosen": -4.441008567810059, "logps/rejected": -5.380014419555664, "loss": 0.0495, "rewards/accuracies": 0.71875, "rewards/chosen": -4.441008567810059, "rewards/margins": 0.9390062093734741, "rewards/rejected": -5.380014419555664, "sft_loss": 4.100342273712158, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 0.5176782637957663, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.46048182249069214, "logits/rejected": -0.2981131970882416, "logps/chosen": -4.4200263023376465, "logps/rejected": -5.22593355178833, "loss": 0.0509, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.4200263023376465, "rewards/margins": 0.8059074282646179, "rewards/rejected": -5.22593355178833, "sft_loss": 4.118147850036621, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 0.5672219301450849, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.4102095067501068, "logits/rejected": -0.22788353264331818, "logps/chosen": -4.337491989135742, "logps/rejected": -5.260950088500977, "loss": 0.0507, "rewards/accuracies": 0.75, "rewards/chosen": -4.337491989135742, "rewards/margins": 0.9234585762023926, "rewards/rejected": -5.260950088500977, "sft_loss": 4.1589765548706055, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 0.4917175367313671, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.39620310068130493, "logits/rejected": -0.20798726379871368, "logps/chosen": -4.377896308898926, "logps/rejected": -5.2211503982543945, "loss": 0.0491, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.377896308898926, "rewards/margins": 0.8432537317276001, "rewards/rejected": -5.2211503982543945, "sft_loss": 4.037580966949463, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 0.5310625596499312, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.4279257357120514, "logits/rejected": -0.46391788125038147, "logps/chosen": -4.196773052215576, "logps/rejected": -5.05009651184082, "loss": 0.0496, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.196773052215576, "rewards/margins": 0.8533236384391785, "rewards/rejected": -5.05009651184082, "sft_loss": 3.8642678260803223, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 0.4117856488441471, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.2937605381011963, "logits/rejected": -0.20567739009857178, "logps/chosen": -4.333211421966553, "logps/rejected": -5.020572662353516, "loss": 0.0507, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.333211421966553, "rewards/margins": 0.687361478805542, "rewards/rejected": -5.020572662353516, "sft_loss": 3.9512009620666504, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 0.5959908535076706, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.2766355574131012, "logits/rejected": -0.09923405945301056, "logps/chosen": -4.339729309082031, "logps/rejected": -5.314494609832764, "loss": 0.0499, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.339729309082031, "rewards/margins": 0.9747658967971802, "rewards/rejected": -5.314494609832764, "sft_loss": 4.076969623565674, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 0.42216745478121104, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.32927799224853516, "logits/rejected": -0.20864331722259521, "logps/chosen": -4.405019283294678, "logps/rejected": -5.173530101776123, "loss": 0.0512, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.405019283294678, "rewards/margins": 0.7685114145278931, "rewards/rejected": -5.173530101776123, "sft_loss": 4.068688869476318, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 0.5712246307382178, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.37075644731521606, "logits/rejected": -0.20269623398780823, "logps/chosen": -4.265493869781494, "logps/rejected": -5.115437030792236, "loss": 0.0501, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.265493869781494, "rewards/margins": 0.8499435186386108, "rewards/rejected": -5.115437030792236, "sft_loss": 4.042442321777344, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 0.6427814813306205, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.44174280762672424, "logits/rejected": -0.18351683020591736, "logps/chosen": -4.300289154052734, "logps/rejected": -5.178165912628174, "loss": 0.0495, "rewards/accuracies": 0.71875, "rewards/chosen": -4.300289154052734, "rewards/margins": 0.8778765797615051, "rewards/rejected": -5.178165912628174, "sft_loss": 3.9347262382507324, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 0.463317227859052, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.4231947362422943, "logits/rejected": -0.255962073802948, "logps/chosen": -4.436898708343506, "logps/rejected": -5.069366931915283, "loss": 0.051, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.436898708343506, "rewards/margins": 0.6324674487113953, "rewards/rejected": -5.069366931915283, "sft_loss": 4.138705253601074, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 0.5303310864210551, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.38676175475120544, "logits/rejected": -0.2671288549900055, "logps/chosen": -4.541159152984619, "logps/rejected": -5.180571556091309, "loss": 0.0506, "rewards/accuracies": 0.6875, "rewards/chosen": -4.541159152984619, "rewards/margins": 0.639412522315979, "rewards/rejected": -5.180571556091309, "sft_loss": 4.238387107849121, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 0.49009820184561415, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.3614255487918854, "logits/rejected": -0.30020448565483093, "logps/chosen": -4.45319128036499, "logps/rejected": -5.10440731048584, "loss": 0.0512, "rewards/accuracies": 0.65625, "rewards/chosen": -4.45319128036499, "rewards/margins": 0.6512158513069153, "rewards/rejected": -5.10440731048584, "sft_loss": 4.143372535705566, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 0.4592325156306962, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.3967505991458893, "logits/rejected": -0.2671595811843872, "logps/chosen": -4.2683258056640625, "logps/rejected": -4.9582109451293945, "loss": 0.0523, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.2683258056640625, "rewards/margins": 0.6898849010467529, "rewards/rejected": -4.9582109451293945, "sft_loss": 3.975896120071411, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 0.4543301716930907, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.3207847774028778, "logits/rejected": -0.1029786691069603, "logps/chosen": -4.457320213317871, "logps/rejected": -5.187180519104004, "loss": 0.0512, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.457320213317871, "rewards/margins": 0.7298603057861328, "rewards/rejected": -5.187180519104004, "sft_loss": 4.13206148147583, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 0.46938640425653955, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.27763500809669495, "logits/rejected": -0.14167213439941406, "logps/chosen": -4.377208232879639, "logps/rejected": -5.241822242736816, "loss": 0.0506, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.377208232879639, "rewards/margins": 0.8646138310432434, "rewards/rejected": -5.241822242736816, "sft_loss": 4.1298441886901855, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 0.6623288312592525, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.36637941002845764, "logits/rejected": -0.12512700259685516, "logps/chosen": -4.084376335144043, "logps/rejected": -5.053424835205078, "loss": 0.0496, "rewards/accuracies": 0.75, "rewards/chosen": -4.084376335144043, "rewards/margins": 0.969048798084259, "rewards/rejected": -5.053424835205078, "sft_loss": 3.852780818939209, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 0.45516242129923573, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.270879864692688, "logits/rejected": -0.23090717196464539, "logps/chosen": -4.380623817443848, "logps/rejected": -5.0403642654418945, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.380623817443848, "rewards/margins": 0.6597407460212708, "rewards/rejected": -5.0403642654418945, "sft_loss": 4.06832218170166, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 0.5497102945676162, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.34165748953819275, "logits/rejected": -0.1425810605287552, "logps/chosen": -4.432342052459717, "logps/rejected": -5.2141642570495605, "loss": 0.0504, "rewards/accuracies": 0.71875, "rewards/chosen": -4.432342052459717, "rewards/margins": 0.7818223237991333, "rewards/rejected": -5.2141642570495605, "sft_loss": 4.182814121246338, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 0.5186126688524669, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.41190558671951294, "logits/rejected": -0.26131734251976013, "logps/chosen": -4.429064750671387, "logps/rejected": -5.08976936340332, "loss": 0.0513, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.429064750671387, "rewards/margins": 0.660704493522644, "rewards/rejected": -5.08976936340332, "sft_loss": 4.123114585876465, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 0.5294964456734004, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.30540773272514343, "logits/rejected": -0.25562867522239685, "logps/chosen": -4.42539119720459, "logps/rejected": -5.28076171875, "loss": 0.0491, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.42539119720459, "rewards/margins": 0.855370819568634, "rewards/rejected": -5.28076171875, "sft_loss": 4.095969200134277, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 0.5460324987819931, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.4230508804321289, "logits/rejected": -0.1935521364212036, "logps/chosen": -4.407500267028809, "logps/rejected": -5.24190616607666, "loss": 0.0509, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.407500267028809, "rewards/margins": 0.834405243396759, "rewards/rejected": -5.24190616607666, "sft_loss": 4.136740684509277, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 0.4644225159425639, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.3028753697872162, "logits/rejected": -0.3180554211139679, "logps/chosen": -4.3475775718688965, "logps/rejected": -5.271296501159668, "loss": 0.0486, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.3475775718688965, "rewards/margins": 0.9237188100814819, "rewards/rejected": -5.271296501159668, "sft_loss": 3.9962737560272217, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 0.4587484730030635, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.3752499222755432, "logits/rejected": -0.21907536685466766, "logps/chosen": -4.364150047302246, "logps/rejected": -5.151577472686768, "loss": 0.0494, "rewards/accuracies": 0.71875, "rewards/chosen": -4.364150047302246, "rewards/margins": 0.7874273061752319, "rewards/rejected": -5.151577472686768, "sft_loss": 4.01193904876709, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 0.5348042270404899, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.3915547728538513, "logits/rejected": -0.35350117087364197, "logps/chosen": -4.341032981872559, "logps/rejected": -5.086256980895996, "loss": 0.0526, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.341032981872559, "rewards/margins": 0.74522465467453, "rewards/rejected": -5.086256980895996, "sft_loss": 4.098066329956055, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 0.5572563588972967, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.2931649088859558, "logits/rejected": -0.10043120384216309, "logps/chosen": -4.398486137390137, "logps/rejected": -5.248486518859863, "loss": 0.0496, "rewards/accuracies": 0.75, "rewards/chosen": -4.398486137390137, "rewards/margins": 0.8499997854232788, "rewards/rejected": -5.248486518859863, "sft_loss": 4.0921549797058105, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 0.8956166487217501, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.25029462575912476, "logits/rejected": -0.37777963280677795, "logps/chosen": -4.2648515701293945, "logps/rejected": -4.919008255004883, "loss": 0.0502, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.2648515701293945, "rewards/margins": 0.6541560888290405, "rewards/rejected": -4.919008255004883, "sft_loss": 3.9281792640686035, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 0.4677871805435953, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.40193086862564087, "logits/rejected": -0.22237029671669006, "logps/chosen": -4.344948768615723, "logps/rejected": -5.091097831726074, "loss": 0.0515, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.344948768615723, "rewards/margins": 0.7461491823196411, "rewards/rejected": -5.091097831726074, "sft_loss": 4.087722301483154, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 0.47711087871367364, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.3756980299949646, "logits/rejected": -0.34971341490745544, "logps/chosen": -4.448171615600586, "logps/rejected": -5.076592445373535, "loss": 0.0516, "rewards/accuracies": 0.6875, "rewards/chosen": -4.448171615600586, "rewards/margins": 0.6284207105636597, "rewards/rejected": -5.076592445373535, "sft_loss": 4.206480026245117, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 0.6378803019183643, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.31666165590286255, "logits/rejected": -0.04280168563127518, "logps/chosen": -4.166003227233887, "logps/rejected": -5.370185852050781, "loss": 0.0484, "rewards/accuracies": 0.84375, "rewards/chosen": -4.166003227233887, "rewards/margins": 1.2041823863983154, "rewards/rejected": -5.370185852050781, "sft_loss": 3.9951655864715576, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 0.4774801536345404, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.4547714293003082, "logits/rejected": -0.2548532783985138, "logps/chosen": -4.340898036956787, "logps/rejected": -5.196542263031006, "loss": 0.0508, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.340898036956787, "rewards/margins": 0.8556438684463501, "rewards/rejected": -5.196542263031006, "sft_loss": 4.143154144287109, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 1.1243775716679412, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.38904356956481934, "logits/rejected": -0.289012610912323, "logps/chosen": -4.332601547241211, "logps/rejected": -5.166747570037842, "loss": 0.0507, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.332601547241211, "rewards/margins": 0.8341460227966309, "rewards/rejected": -5.166747570037842, "sft_loss": 4.06862735748291, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 0.5001429058189203, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.38545042276382446, "logits/rejected": -0.16294018924236298, "logps/chosen": -4.417631149291992, "logps/rejected": -5.205663204193115, "loss": 0.0536, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.417631149291992, "rewards/margins": 0.7880316972732544, "rewards/rejected": -5.205663204193115, "sft_loss": 4.275336265563965, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 0.7599250222037716, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.3082340955734253, "logits/rejected": -0.15593746304512024, "logps/chosen": -4.3028974533081055, "logps/rejected": -5.224949836730957, "loss": 0.05, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.3028974533081055, "rewards/margins": 0.9220517873764038, "rewards/rejected": -5.224949836730957, "sft_loss": 4.00002908706665, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 0.4147075847544824, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.3403518795967102, "logits/rejected": -0.2342265546321869, "logps/chosen": -4.296928882598877, "logps/rejected": -5.133492469787598, "loss": 0.0512, "rewards/accuracies": 0.6875, "rewards/chosen": -4.296928882598877, "rewards/margins": 0.836563766002655, "rewards/rejected": -5.133492469787598, "sft_loss": 3.9882049560546875, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 0.4884901044848451, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.4230726659297943, "logits/rejected": -0.30629175901412964, "logps/chosen": -4.268218040466309, "logps/rejected": -5.060492038726807, "loss": 0.0508, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.268218040466309, "rewards/margins": 0.7922734022140503, "rewards/rejected": -5.060492038726807, "sft_loss": 4.090802192687988, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 0.503796507703663, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.35504621267318726, "logits/rejected": -0.3044523596763611, "logps/chosen": -4.365761756896973, "logps/rejected": -5.030825138092041, "loss": 0.0517, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.365761756896973, "rewards/margins": 0.6650637984275818, "rewards/rejected": -5.030825138092041, "sft_loss": 4.040749549865723, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 0.5400763074236795, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.4038558900356293, "logits/rejected": -0.28878653049468994, "logps/chosen": -4.315426826477051, "logps/rejected": -5.185266494750977, "loss": 0.0513, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.315426826477051, "rewards/margins": 0.8698400259017944, "rewards/rejected": -5.185266494750977, "sft_loss": 4.082798957824707, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 0.7780856726072443, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.3635145425796509, "logits/rejected": -0.24427835643291473, "logps/chosen": -4.404698848724365, "logps/rejected": -5.149372100830078, "loss": 0.0512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.404698848724365, "rewards/margins": 0.7446734309196472, "rewards/rejected": -5.149372100830078, "sft_loss": 4.210453987121582, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 0.5490396914459478, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.359261691570282, "logits/rejected": -0.23605218529701233, "logps/chosen": -4.36661434173584, "logps/rejected": -5.151679515838623, "loss": 0.0513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.36661434173584, "rewards/margins": 0.7850648164749146, "rewards/rejected": -5.151679515838623, "sft_loss": 4.143317699432373, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.07847874611616135, "eval_logits/rejected": 0.17865531146526337, "eval_logps/chosen": -4.370919227600098, "eval_logps/rejected": -5.116001605987549, "eval_loss": 0.04999160394072533, "eval_rewards/accuracies": 0.6943620443344116, "eval_rewards/chosen": -4.370919227600098, "eval_rewards/margins": 0.7450823187828064, "eval_rewards/rejected": -5.116001605987549, "eval_runtime": 44.4974, "eval_samples_per_second": 30.226, "eval_sft_loss": 3.973156213760376, "eval_steps_per_second": 7.573, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 0.4366784779733865, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.4094223976135254, "logits/rejected": -0.3121436834335327, "logps/chosen": -4.305196285247803, "logps/rejected": -5.099860191345215, "loss": 0.0508, "rewards/accuracies": 0.6875, "rewards/chosen": -4.305196285247803, "rewards/margins": 0.7946635484695435, "rewards/rejected": -5.099860191345215, "sft_loss": 4.0197625160217285, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 0.785271503992536, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.3667059540748596, "logits/rejected": -0.289445161819458, "logps/chosen": -4.2451043128967285, "logps/rejected": -5.16410493850708, "loss": 0.0501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.2451043128967285, "rewards/margins": 0.9190011024475098, "rewards/rejected": -5.16410493850708, "sft_loss": 3.976490020751953, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 0.657010538300424, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.36434513330459595, "logits/rejected": -0.198884516954422, "logps/chosen": -4.356508731842041, "logps/rejected": -5.266931056976318, "loss": 0.0515, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.356508731842041, "rewards/margins": 0.9104223251342773, "rewards/rejected": -5.266931056976318, "sft_loss": 4.159215927124023, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 0.5736893794094124, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.3771006464958191, "logits/rejected": -0.2650943994522095, "logps/chosen": -4.496792793273926, "logps/rejected": -5.096777439117432, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.496792793273926, "rewards/margins": 0.5999849438667297, "rewards/rejected": -5.096777439117432, "sft_loss": 4.190961837768555, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 0.5535433811032741, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.31344643235206604, "logits/rejected": -0.15652044117450714, "logps/chosen": -4.29534387588501, "logps/rejected": -5.13301944732666, "loss": 0.0496, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.29534387588501, "rewards/margins": 0.837675929069519, "rewards/rejected": -5.13301944732666, "sft_loss": 4.011329174041748, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 0.5464049154304894, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.3962657153606415, "logits/rejected": -0.20990128815174103, "logps/chosen": -4.331116676330566, "logps/rejected": -5.164947509765625, "loss": 0.0504, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.331116676330566, "rewards/margins": 0.8338314890861511, "rewards/rejected": -5.164947509765625, "sft_loss": 4.151193618774414, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 0.5312457533985412, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.44756752252578735, "logits/rejected": -0.2316436767578125, "logps/chosen": -4.406530857086182, "logps/rejected": -5.2714948654174805, "loss": 0.0499, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.406530857086182, "rewards/margins": 0.8649638295173645, "rewards/rejected": -5.2714948654174805, "sft_loss": 4.123922824859619, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 0.6044695941107733, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.4090364873409271, "logits/rejected": -0.15606291592121124, "logps/chosen": -4.223238945007324, "logps/rejected": -5.190573692321777, "loss": 0.0475, "rewards/accuracies": 0.71875, "rewards/chosen": -4.223238945007324, "rewards/margins": 0.9673342704772949, "rewards/rejected": -5.190573692321777, "sft_loss": 3.814497709274292, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 0.9832175459629173, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.3239671587944031, "logits/rejected": -0.2865348160266876, "logps/chosen": -4.409046173095703, "logps/rejected": -5.107858657836914, "loss": 0.0512, "rewards/accuracies": 0.6875, "rewards/chosen": -4.409046173095703, "rewards/margins": 0.69881272315979, "rewards/rejected": -5.107858657836914, "sft_loss": 4.158252716064453, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 0.5454129043795762, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.5119531750679016, "logits/rejected": -0.30409321188926697, "logps/chosen": -4.390221118927002, "logps/rejected": -5.251105785369873, "loss": 0.0517, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.390221118927002, "rewards/margins": 0.8608850240707397, "rewards/rejected": -5.251105785369873, "sft_loss": 4.141351222991943, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 0.5888971213716986, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.28511926531791687, "logits/rejected": -0.11991135776042938, "logps/chosen": -4.358060359954834, "logps/rejected": -5.080539703369141, "loss": 0.0517, "rewards/accuracies": 0.6875, "rewards/chosen": -4.358060359954834, "rewards/margins": 0.7224793434143066, "rewards/rejected": -5.080539703369141, "sft_loss": 4.084187030792236, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 0.7700086644895395, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.3371310234069824, "logits/rejected": -0.21796874701976776, "logps/chosen": -4.2935686111450195, "logps/rejected": -5.074094295501709, "loss": 0.0521, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.2935686111450195, "rewards/margins": 0.7805261611938477, "rewards/rejected": -5.074094295501709, "sft_loss": 4.1205830574035645, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 0.49366993906433826, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.45760640501976013, "logits/rejected": -0.35235825181007385, "logps/chosen": -4.228287220001221, "logps/rejected": -5.1750922203063965, "loss": 0.049, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.228287220001221, "rewards/margins": 0.9468050003051758, "rewards/rejected": -5.1750922203063965, "sft_loss": 4.011025428771973, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 0.6999895260792679, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.2858087420463562, "logits/rejected": -0.16768726706504822, "logps/chosen": -4.2022809982299805, "logps/rejected": -5.1616997718811035, "loss": 0.0494, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.2022809982299805, "rewards/margins": 0.9594185948371887, "rewards/rejected": -5.1616997718811035, "sft_loss": 3.9046719074249268, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 0.731734899158958, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.2684374451637268, "logits/rejected": -0.16356366872787476, "logps/chosen": -4.374823093414307, "logps/rejected": -5.219290733337402, "loss": 0.0511, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.374823093414307, "rewards/margins": 0.8444677591323853, "rewards/rejected": -5.219290733337402, "sft_loss": 4.082675457000732, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 0.6332443526161032, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.27860182523727417, "logits/rejected": -0.15365949273109436, "logps/chosen": -4.211956024169922, "logps/rejected": -5.227756023406982, "loss": 0.0498, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.211956024169922, "rewards/margins": 1.01580011844635, "rewards/rejected": -5.227756023406982, "sft_loss": 3.9831924438476562, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 0.5668796452945385, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.27431243658065796, "logits/rejected": -0.22958464920520782, "logps/chosen": -4.336443901062012, "logps/rejected": -5.17434024810791, "loss": 0.0511, "rewards/accuracies": 0.71875, "rewards/chosen": -4.336443901062012, "rewards/margins": 0.8378962278366089, "rewards/rejected": -5.17434024810791, "sft_loss": 4.1342644691467285, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 0.5156007934403071, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.4549378752708435, "logits/rejected": -0.2466200590133667, "logps/chosen": -4.278968811035156, "logps/rejected": -5.1642656326293945, "loss": 0.0507, "rewards/accuracies": 0.75, "rewards/chosen": -4.278968811035156, "rewards/margins": 0.8852967023849487, "rewards/rejected": -5.1642656326293945, "sft_loss": 4.013367652893066, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 0.5975897826628281, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.3374733030796051, "logits/rejected": -0.2785305976867676, "logps/chosen": -4.33203649520874, "logps/rejected": -5.043580055236816, "loss": 0.051, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.33203649520874, "rewards/margins": 0.7115433812141418, "rewards/rejected": -5.043580055236816, "sft_loss": 4.101061820983887, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 0.7935012047266726, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.30603188276290894, "logits/rejected": -0.1277734935283661, "logps/chosen": -4.321929931640625, "logps/rejected": -5.279784202575684, "loss": 0.05, "rewards/accuracies": 0.71875, "rewards/chosen": -4.321929931640625, "rewards/margins": 0.9578543901443481, "rewards/rejected": -5.279784202575684, "sft_loss": 4.039951801300049, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 0.5670757081729894, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.32785436511039734, "logits/rejected": -0.17707887291908264, "logps/chosen": -4.282454013824463, "logps/rejected": -5.355355739593506, "loss": 0.0481, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.282454013824463, "rewards/margins": 1.072901964187622, "rewards/rejected": -5.355355739593506, "sft_loss": 4.057803630828857, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 0.7615789792899859, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.3168491721153259, "logits/rejected": -0.37255367636680603, "logps/chosen": -4.4251322746276855, "logps/rejected": -5.246760368347168, "loss": 0.0515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.4251322746276855, "rewards/margins": 0.8216277360916138, "rewards/rejected": -5.246760368347168, "sft_loss": 4.182235240936279, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 0.4919973143825708, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.3505459725856781, "logits/rejected": -0.1527949571609497, "logps/chosen": -4.304749488830566, "logps/rejected": -5.080583095550537, "loss": 0.0504, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.304749488830566, "rewards/margins": 0.7758339643478394, "rewards/rejected": -5.080583095550537, "sft_loss": 4.022045612335205, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 0.677617849418434, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.37543028593063354, "logits/rejected": -0.2667499780654907, "logps/chosen": -4.324775218963623, "logps/rejected": -5.045456409454346, "loss": 0.0503, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.324775218963623, "rewards/margins": 0.7206807136535645, "rewards/rejected": -5.045456409454346, "sft_loss": 4.059634208679199, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 0.573609389327282, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.5110586881637573, "logits/rejected": -0.22267436981201172, "logps/chosen": -4.291056156158447, "logps/rejected": -5.223393440246582, "loss": 0.0495, "rewards/accuracies": 0.75, "rewards/chosen": -4.291056156158447, "rewards/margins": 0.9323371648788452, "rewards/rejected": -5.223393440246582, "sft_loss": 4.041526794433594, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 0.5475086704999267, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.3410526216030121, "logits/rejected": -0.11093126237392426, "logps/chosen": -4.463144779205322, "logps/rejected": -5.343132019042969, "loss": 0.0511, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.463144779205322, "rewards/margins": 0.8799868822097778, "rewards/rejected": -5.343132019042969, "sft_loss": 4.178741455078125, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 0.48781828384348774, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.42462921142578125, "logits/rejected": -0.2016540765762329, "logps/chosen": -4.22618293762207, "logps/rejected": -5.108307838439941, "loss": 0.0498, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.22618293762207, "rewards/margins": 0.8821243047714233, "rewards/rejected": -5.108307838439941, "sft_loss": 3.8731770515441895, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 0.5380379219048013, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.4162037968635559, "logits/rejected": -0.32498764991760254, "logps/chosen": -4.193484306335449, "logps/rejected": -4.916443824768066, "loss": 0.0501, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.193484306335449, "rewards/margins": 0.7229597568511963, "rewards/rejected": -4.916443824768066, "sft_loss": 3.9398722648620605, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 0.553196750733812, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.3653312027454376, "logits/rejected": -0.14281252026557922, "logps/chosen": -4.307368278503418, "logps/rejected": -5.142277240753174, "loss": 0.0498, "rewards/accuracies": 0.71875, "rewards/chosen": -4.307368278503418, "rewards/margins": 0.8349090814590454, "rewards/rejected": -5.142277240753174, "sft_loss": 3.9290771484375, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 0.6956238773053149, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.32470768690109253, "logits/rejected": -0.12303704023361206, "logps/chosen": -4.257458686828613, "logps/rejected": -5.020730018615723, "loss": 0.0512, "rewards/accuracies": 0.71875, "rewards/chosen": -4.257458686828613, "rewards/margins": 0.763271689414978, "rewards/rejected": -5.020730018615723, "sft_loss": 3.9516940116882324, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 0.5982140764367317, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.3422200381755829, "logits/rejected": -0.31732505559921265, "logps/chosen": -4.543044567108154, "logps/rejected": -5.1809258460998535, "loss": 0.0505, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.543044567108154, "rewards/margins": 0.6378811001777649, "rewards/rejected": -5.1809258460998535, "sft_loss": 4.218386650085449, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 0.5497994656475738, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.3104451298713684, "logits/rejected": -0.2130269706249237, "logps/chosen": -4.269969940185547, "logps/rejected": -5.149529457092285, "loss": 0.0509, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.269969940185547, "rewards/margins": 0.8795592188835144, "rewards/rejected": -5.149529457092285, "sft_loss": 4.051764965057373, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 0.445435483211514, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.4530177712440491, "logits/rejected": -0.3011803925037384, "logps/chosen": -4.275309085845947, "logps/rejected": -5.169306755065918, "loss": 0.0502, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.275309085845947, "rewards/margins": 0.8939980268478394, "rewards/rejected": -5.169306755065918, "sft_loss": 4.088404655456543, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 0.5856523820842301, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.23590047657489777, "logits/rejected": -0.20851969718933105, "logps/chosen": -4.351941108703613, "logps/rejected": -5.117374897003174, "loss": 0.0489, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.351941108703613, "rewards/margins": 0.7654340863227844, "rewards/rejected": -5.117374897003174, "sft_loss": 3.916269302368164, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 0.6735940707026237, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.29202502965927124, "logits/rejected": -0.2616046965122223, "logps/chosen": -4.410584926605225, "logps/rejected": -5.258556365966797, "loss": 0.0508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.410584926605225, "rewards/margins": 0.8479716181755066, "rewards/rejected": -5.258556365966797, "sft_loss": 4.140559196472168, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 0.45364675526602016, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.18361981213092804, "logits/rejected": -0.17511440813541412, "logps/chosen": -4.39959716796875, "logps/rejected": -5.171971321105957, "loss": 0.0512, "rewards/accuracies": 0.6875, "rewards/chosen": -4.39959716796875, "rewards/margins": 0.7723743915557861, "rewards/rejected": -5.171971321105957, "sft_loss": 4.043013572692871, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 0.5784270412666574, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.2834405303001404, "logits/rejected": -0.0836217850446701, "logps/chosen": -4.3537139892578125, "logps/rejected": -5.3170366287231445, "loss": 0.0495, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.3537139892578125, "rewards/margins": 0.9633221626281738, "rewards/rejected": -5.3170366287231445, "sft_loss": 4.033775329589844, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 0.530877020771359, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.4802896976470947, "logits/rejected": -0.29643306136131287, "logps/chosen": -4.233144760131836, "logps/rejected": -5.321981906890869, "loss": 0.0488, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.233144760131836, "rewards/margins": 1.088836669921875, "rewards/rejected": -5.321981906890869, "sft_loss": 4.009449481964111, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 0.6747135053910271, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.385011225938797, "logits/rejected": -0.4125341773033142, "logps/chosen": -4.425052642822266, "logps/rejected": -5.143741607666016, "loss": 0.0518, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.425052642822266, "rewards/margins": 0.7186892628669739, "rewards/rejected": -5.143741607666016, "sft_loss": 4.189584732055664, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 0.5537563795116268, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.35988473892211914, "logits/rejected": -0.3347950577735901, "logps/chosen": -4.537070274353027, "logps/rejected": -5.29421854019165, "loss": 0.0512, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.537070274353027, "rewards/margins": 0.7571475505828857, "rewards/rejected": -5.29421854019165, "sft_loss": 4.247420310974121, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 0.6340292788783681, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.31119558215141296, "logits/rejected": -0.24769540131092072, "logps/chosen": -4.337439060211182, "logps/rejected": -5.011359214782715, "loss": 0.0505, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.337439060211182, "rewards/margins": 0.673919677734375, "rewards/rejected": -5.011359214782715, "sft_loss": 4.067869186401367, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 0.4995947109233343, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.27810150384902954, "logits/rejected": -0.22744593024253845, "logps/chosen": -4.439764976501465, "logps/rejected": -5.297387599945068, "loss": 0.0498, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.439764976501465, "rewards/margins": 0.8576227426528931, "rewards/rejected": -5.297387599945068, "sft_loss": 4.06976842880249, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 0.4910614447723621, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.3293796181678772, "logits/rejected": -0.2629985809326172, "logps/chosen": -4.394562244415283, "logps/rejected": -5.318913459777832, "loss": 0.0489, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.394562244415283, "rewards/margins": 0.9243508577346802, "rewards/rejected": -5.318913459777832, "sft_loss": 4.018808364868164, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 0.5700669196553817, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.42885392904281616, "logits/rejected": -0.15153753757476807, "logps/chosen": -4.297393798828125, "logps/rejected": -5.1750288009643555, "loss": 0.0496, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.297393798828125, "rewards/margins": 0.87763512134552, "rewards/rejected": -5.1750288009643555, "sft_loss": 4.046809196472168, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 0.4570361869947965, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.41207781434059143, "logits/rejected": -0.2339569628238678, "logps/chosen": -4.3805341720581055, "logps/rejected": -5.119570255279541, "loss": 0.0515, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.3805341720581055, "rewards/margins": 0.7390362024307251, "rewards/rejected": -5.119570255279541, "sft_loss": 4.159833908081055, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 0.6189906085316598, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.3554043173789978, "logits/rejected": -0.2785646915435791, "logps/chosen": -4.2589263916015625, "logps/rejected": -5.20426082611084, "loss": 0.0502, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.2589263916015625, "rewards/margins": 0.9453340768814087, "rewards/rejected": -5.20426082611084, "sft_loss": 4.017521858215332, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 0.4158614258443814, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.3380299210548401, "logits/rejected": -0.24163658916950226, "logps/chosen": -4.41104793548584, "logps/rejected": -5.293055534362793, "loss": 0.0509, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.41104793548584, "rewards/margins": 0.8820083737373352, "rewards/rejected": -5.293055534362793, "sft_loss": 4.177367210388184, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 0.6648398728304554, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.1934209167957306, "logits/rejected": -0.25234749913215637, "logps/chosen": -4.489556312561035, "logps/rejected": -5.1544575691223145, "loss": 0.0514, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.489556312561035, "rewards/margins": 0.6649015545845032, "rewards/rejected": -5.1544575691223145, "sft_loss": 4.170198917388916, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 0.8852667353919006, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.22296173870563507, "logits/rejected": -0.2605739235877991, "logps/chosen": -4.362579345703125, "logps/rejected": -5.030747413635254, "loss": 0.05, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.362579345703125, "rewards/margins": 0.6681679487228394, "rewards/rejected": -5.030747413635254, "sft_loss": 4.08138370513916, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 0.865852246921178, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.32960090041160583, "logits/rejected": -0.24417448043823242, "logps/chosen": -4.399921894073486, "logps/rejected": -5.093794822692871, "loss": 0.0506, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.399921894073486, "rewards/margins": 0.6938729882240295, "rewards/rejected": -5.093794822692871, "sft_loss": 4.056646347045898, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 0.5888597395995303, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.4537014067173004, "logits/rejected": -0.37281662225723267, "logps/chosen": -4.381228446960449, "logps/rejected": -5.203554630279541, "loss": 0.0502, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.381228446960449, "rewards/margins": 0.8223265409469604, "rewards/rejected": -5.203554630279541, "sft_loss": 4.094564914703369, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 0.7002768460470424, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.28774571418762207, "logits/rejected": -0.24743354320526123, "logps/chosen": -4.308173179626465, "logps/rejected": -5.016545295715332, "loss": 0.0519, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.308173179626465, "rewards/margins": 0.7083726525306702, "rewards/rejected": -5.016545295715332, "sft_loss": 4.048774242401123, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 0.5846364610229815, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.31515955924987793, "logits/rejected": -0.04892424866557121, "logps/chosen": -4.327048301696777, "logps/rejected": -5.070046424865723, "loss": 0.0505, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.327048301696777, "rewards/margins": 0.7429983019828796, "rewards/rejected": -5.070046424865723, "sft_loss": 4.062623023986816, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 0.5600521422021959, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.2908742427825928, "logits/rejected": -0.13980403542518616, "logps/chosen": -4.316672325134277, "logps/rejected": -5.053095817565918, "loss": 0.05, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.316672325134277, "rewards/margins": 0.7364233732223511, "rewards/rejected": -5.053095817565918, "sft_loss": 4.109248161315918, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 0.6693016025580347, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.31663063168525696, "logits/rejected": -0.26752668619155884, "logps/chosen": -4.199077606201172, "logps/rejected": -4.882410049438477, "loss": 0.0522, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.199077606201172, "rewards/margins": 0.6833322644233704, "rewards/rejected": -4.882410049438477, "sft_loss": 4.017780303955078, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 0.5585138005543521, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.4366474151611328, "logits/rejected": -0.26478832960128784, "logps/chosen": -4.335009574890137, "logps/rejected": -5.138851165771484, "loss": 0.0512, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.335009574890137, "rewards/margins": 0.8038414716720581, "rewards/rejected": -5.138851165771484, "sft_loss": 4.115544319152832, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 0.5115652071606784, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.39152833819389343, "logits/rejected": -0.25694766640663147, "logps/chosen": -4.440808296203613, "logps/rejected": -5.392234802246094, "loss": 0.0503, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.440808296203613, "rewards/margins": 0.9514263272285461, "rewards/rejected": -5.392234802246094, "sft_loss": 4.244625568389893, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 0.4685102979697998, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.48699942231178284, "logits/rejected": -0.27683359384536743, "logps/chosen": -4.237309455871582, "logps/rejected": -5.1978759765625, "loss": 0.0488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.237309455871582, "rewards/margins": 0.9605666995048523, "rewards/rejected": -5.1978759765625, "sft_loss": 3.9901108741760254, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 0.4037980863154395, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.5207828283309937, "logits/rejected": -0.2705624997615814, "logps/chosen": -4.286375045776367, "logps/rejected": -5.243197441101074, "loss": 0.0501, "rewards/accuracies": 0.75, "rewards/chosen": -4.286375045776367, "rewards/margins": 0.9568222761154175, "rewards/rejected": -5.243197441101074, "sft_loss": 4.061440944671631, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 0.5770932093871353, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.30437716841697693, "logits/rejected": -0.14506526291370392, "logps/chosen": -4.377502918243408, "logps/rejected": -5.2365498542785645, "loss": 0.0513, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.377502918243408, "rewards/margins": 0.859046459197998, "rewards/rejected": -5.2365498542785645, "sft_loss": 4.1016740798950195, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 0.5933158360229613, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.3560345768928528, "logits/rejected": -0.169934943318367, "logps/chosen": -4.517114162445068, "logps/rejected": -5.2097673416137695, "loss": 0.05, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.517114162445068, "rewards/margins": 0.6926525831222534, "rewards/rejected": -5.2097673416137695, "sft_loss": 4.1477837562561035, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 0.4786597019537258, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.35335972905158997, "logits/rejected": -0.27022966742515564, "logps/chosen": -4.437841892242432, "logps/rejected": -5.270761013031006, "loss": 0.05, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.437841892242432, "rewards/margins": 0.8329197764396667, "rewards/rejected": -5.270761013031006, "sft_loss": 4.032576084136963, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 0.4840975269990418, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.48118534684181213, "logits/rejected": -0.25537365674972534, "logps/chosen": -4.186007022857666, "logps/rejected": -5.362186908721924, "loss": 0.0484, "rewards/accuracies": 0.75, "rewards/chosen": -4.186007022857666, "rewards/margins": 1.1761797666549683, "rewards/rejected": -5.362186908721924, "sft_loss": 3.971787929534912, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 0.6293456593288709, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.35397639870643616, "logits/rejected": -0.2746841311454773, "logps/chosen": -4.240942478179932, "logps/rejected": -5.1576995849609375, "loss": 0.0501, "rewards/accuracies": 0.78125, "rewards/chosen": -4.240942478179932, "rewards/margins": 0.916756808757782, "rewards/rejected": -5.1576995849609375, "sft_loss": 4.038487911224365, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 0.6509939909042833, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.5028539299964905, "logits/rejected": -0.22538447380065918, "logps/chosen": -4.42642068862915, "logps/rejected": -5.166515350341797, "loss": 0.0505, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.42642068862915, "rewards/margins": 0.7400942444801331, "rewards/rejected": -5.166515350341797, "sft_loss": 4.125155448913574, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 0.4241187697346418, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.5067978501319885, "logits/rejected": -0.2537830173969269, "logps/chosen": -4.308220863342285, "logps/rejected": -5.137172222137451, "loss": 0.0507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.308220863342285, "rewards/margins": 0.828951358795166, "rewards/rejected": -5.137172222137451, "sft_loss": 4.0658135414123535, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 0.6211864652488323, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.26569199562072754, "logits/rejected": -0.2149532586336136, "logps/chosen": -4.469195365905762, "logps/rejected": -5.14188289642334, "loss": 0.0519, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.469195365905762, "rewards/margins": 0.6726875901222229, "rewards/rejected": -5.14188289642334, "sft_loss": 4.128327369689941, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 0.6295297741777652, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.30820053815841675, "logits/rejected": -0.16536325216293335, "logps/chosen": -4.0469970703125, "logps/rejected": -5.018959045410156, "loss": 0.0492, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.0469970703125, "rewards/margins": 0.9719620943069458, "rewards/rejected": -5.018959045410156, "sft_loss": 3.805692195892334, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 0.6033387892484273, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.2960502803325653, "logits/rejected": -0.14533351361751556, "logps/chosen": -4.287169456481934, "logps/rejected": -5.153046131134033, "loss": 0.0497, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.287169456481934, "rewards/margins": 0.8658763766288757, "rewards/rejected": -5.153046131134033, "sft_loss": 3.994807004928589, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 0.3556324310223384, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.41091519594192505, "logits/rejected": -0.25811105966567993, "logps/chosen": -4.2101640701293945, "logps/rejected": -5.0592851638793945, "loss": 0.05, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.2101640701293945, "rewards/margins": 0.8491213917732239, "rewards/rejected": -5.0592851638793945, "sft_loss": 3.965034008026123, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 0.45918008796138426, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.42323416471481323, "logits/rejected": -0.33379656076431274, "logps/chosen": -4.424660682678223, "logps/rejected": -5.175339698791504, "loss": 0.0511, "rewards/accuracies": 0.71875, "rewards/chosen": -4.424660682678223, "rewards/margins": 0.7506788372993469, "rewards/rejected": -5.175339698791504, "sft_loss": 4.200197696685791, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 0.4557503980800935, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.35138794779777527, "logits/rejected": -0.34763628244400024, "logps/chosen": -4.337948322296143, "logps/rejected": -5.078485012054443, "loss": 0.0509, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.337948322296143, "rewards/margins": 0.7405366897583008, "rewards/rejected": -5.078485012054443, "sft_loss": 4.110854148864746, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 0.4334158893596347, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.3873664140701294, "logits/rejected": -0.18847136199474335, "logps/chosen": -4.274514675140381, "logps/rejected": -5.120308876037598, "loss": 0.0503, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.274514675140381, "rewards/margins": 0.8457947969436646, "rewards/rejected": -5.120308876037598, "sft_loss": 4.066677570343018, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 0.5085900205081568, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.3473703861236572, "logits/rejected": -0.23684850335121155, "logps/chosen": -4.316211700439453, "logps/rejected": -5.224501609802246, "loss": 0.0492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.316211700439453, "rewards/margins": 0.9082896113395691, "rewards/rejected": -5.224501609802246, "sft_loss": 4.066177845001221, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 0.5671962752351475, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.34605956077575684, "logits/rejected": -0.28665685653686523, "logps/chosen": -4.406753063201904, "logps/rejected": -5.275516510009766, "loss": 0.0489, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.406753063201904, "rewards/margins": 0.8687634468078613, "rewards/rejected": -5.275516510009766, "sft_loss": 4.098984241485596, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 0.6257159164544014, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.30671659111976624, "logits/rejected": -0.23999974131584167, "logps/chosen": -4.360651969909668, "logps/rejected": -5.106849193572998, "loss": 0.0516, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.360651969909668, "rewards/margins": 0.7461972832679749, "rewards/rejected": -5.106849193572998, "sft_loss": 4.17466402053833, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 0.4299714719732261, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.30148938298225403, "logits/rejected": -0.14409419894218445, "logps/chosen": -4.323808193206787, "logps/rejected": -5.228428840637207, "loss": 0.0502, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.323808193206787, "rewards/margins": 0.9046202898025513, "rewards/rejected": -5.228428840637207, "sft_loss": 4.030442237854004, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 0.5583552209547061, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.45117372274398804, "logits/rejected": -0.25930553674697876, "logps/chosen": -4.293408393859863, "logps/rejected": -5.134814262390137, "loss": 0.0495, "rewards/accuracies": 0.78125, "rewards/chosen": -4.293408393859863, "rewards/margins": 0.841405987739563, "rewards/rejected": -5.134814262390137, "sft_loss": 4.050257682800293, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 0.5503319371421598, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.30263951420783997, "logits/rejected": -0.1634255200624466, "logps/chosen": -4.305006980895996, "logps/rejected": -5.143341064453125, "loss": 0.0488, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.305006980895996, "rewards/margins": 0.8383339047431946, "rewards/rejected": -5.143341064453125, "sft_loss": 4.050868988037109, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 0.44537831132872474, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.4748370051383972, "logits/rejected": -0.2880566418170929, "logps/chosen": -4.3707966804504395, "logps/rejected": -5.182746410369873, "loss": 0.0498, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.3707966804504395, "rewards/margins": 0.8119505643844604, "rewards/rejected": -5.182746410369873, "sft_loss": 3.9548544883728027, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.11339718848466873, "eval_logits/rejected": 0.21380193531513214, "eval_logps/chosen": -4.331807613372803, "eval_logps/rejected": -5.096880912780762, "eval_loss": 0.049965761601924896, "eval_rewards/accuracies": 0.68916916847229, "eval_rewards/chosen": -4.331807613372803, "eval_rewards/margins": 0.7650735974311829, "eval_rewards/rejected": -5.096880912780762, "eval_runtime": 44.5977, "eval_samples_per_second": 30.159, "eval_sft_loss": 3.937229871749878, "eval_steps_per_second": 7.556, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 0.63650009255052, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.3355264961719513, "logits/rejected": -0.3086664378643036, "logps/chosen": -4.177042007446289, "logps/rejected": -5.167255401611328, "loss": 0.0491, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.177042007446289, "rewards/margins": 0.9902137517929077, "rewards/rejected": -5.167255401611328, "sft_loss": 3.9284653663635254, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 0.6238382815083907, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.36723607778549194, "logits/rejected": -0.25406724214553833, "logps/chosen": -4.518461227416992, "logps/rejected": -5.312617301940918, "loss": 0.051, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.518461227416992, "rewards/margins": 0.7941561937332153, "rewards/rejected": -5.312617301940918, "sft_loss": 4.1740217208862305, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 0.4804024279136467, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.3804596960544586, "logits/rejected": -0.2999555468559265, "logps/chosen": -4.189560413360596, "logps/rejected": -5.1501569747924805, "loss": 0.0492, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.189560413360596, "rewards/margins": 0.9605971574783325, "rewards/rejected": -5.1501569747924805, "sft_loss": 4.015095233917236, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 0.532833051713504, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.3272797763347626, "logits/rejected": -0.2577647864818573, "logps/chosen": -4.291745185852051, "logps/rejected": -5.294327735900879, "loss": 0.0488, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.291745185852051, "rewards/margins": 1.00258207321167, "rewards/rejected": -5.294327735900879, "sft_loss": 4.015843391418457, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 0.5326192755761749, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.3303220868110657, "logits/rejected": -0.1386190950870514, "logps/chosen": -4.393420219421387, "logps/rejected": -5.122389793395996, "loss": 0.0522, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.393420219421387, "rewards/margins": 0.7289689779281616, "rewards/rejected": -5.122389793395996, "sft_loss": 4.1007585525512695, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 0.6058086369493448, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.43916431069374084, "logits/rejected": -0.2893034517765045, "logps/chosen": -4.339142799377441, "logps/rejected": -5.268829822540283, "loss": 0.0484, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.339142799377441, "rewards/margins": 0.9296862483024597, "rewards/rejected": -5.268829822540283, "sft_loss": 3.947129487991333, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 0.43408048665096755, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.43070071935653687, "logits/rejected": -0.25799840688705444, "logps/chosen": -4.269518852233887, "logps/rejected": -5.2071733474731445, "loss": 0.0492, "rewards/accuracies": 0.75, "rewards/chosen": -4.269518852233887, "rewards/margins": 0.9376543164253235, "rewards/rejected": -5.2071733474731445, "sft_loss": 3.977576494216919, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 0.5127739715217935, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.43267449736595154, "logits/rejected": -0.20683661103248596, "logps/chosen": -4.34108304977417, "logps/rejected": -5.199608325958252, "loss": 0.0495, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.34108304977417, "rewards/margins": 0.8585250973701477, "rewards/rejected": -5.199608325958252, "sft_loss": 4.119553089141846, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 0.44168776044394253, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.35988324880599976, "logits/rejected": -0.3104974329471588, "logps/chosen": -4.353398323059082, "logps/rejected": -5.132560729980469, "loss": 0.0511, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.353398323059082, "rewards/margins": 0.779162585735321, "rewards/rejected": -5.132560729980469, "sft_loss": 4.198001384735107, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 0.5776884999882473, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.46909505128860474, "logits/rejected": -0.30127614736557007, "logps/chosen": -4.262785911560059, "logps/rejected": -5.127320766448975, "loss": 0.0505, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.262785911560059, "rewards/margins": 0.8645352125167847, "rewards/rejected": -5.127320766448975, "sft_loss": 4.043597221374512, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 0.5770063136674014, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.4060916006565094, "logits/rejected": -0.16923363506793976, "logps/chosen": -4.230871677398682, "logps/rejected": -5.168854713439941, "loss": 0.0502, "rewards/accuracies": 0.75, "rewards/chosen": -4.230871677398682, "rewards/margins": 0.9379828572273254, "rewards/rejected": -5.168854713439941, "sft_loss": 4.0604095458984375, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 0.48959950508599864, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.3215634226799011, "logits/rejected": -0.2579534649848938, "logps/chosen": -4.367952346801758, "logps/rejected": -5.0876593589782715, "loss": 0.053, "rewards/accuracies": 0.625, "rewards/chosen": -4.367952346801758, "rewards/margins": 0.7197073698043823, "rewards/rejected": -5.0876593589782715, "sft_loss": 4.103386878967285, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 0.4936544311232592, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.48994073271751404, "logits/rejected": -0.23816998302936554, "logps/chosen": -4.147706031799316, "logps/rejected": -5.310173988342285, "loss": 0.0476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.147706031799316, "rewards/margins": 1.162468671798706, "rewards/rejected": -5.310173988342285, "sft_loss": 3.913297176361084, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 0.48184078763381777, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.3174903988838196, "logits/rejected": -0.1744978427886963, "logps/chosen": -4.236815452575684, "logps/rejected": -5.054623603820801, "loss": 0.0503, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.236815452575684, "rewards/margins": 0.8178078532218933, "rewards/rejected": -5.054623603820801, "sft_loss": 4.0225510597229, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 0.49873076680439726, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.3413674831390381, "logits/rejected": -0.19086118042469025, "logps/chosen": -4.315398216247559, "logps/rejected": -5.07651424407959, "loss": 0.0504, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.315398216247559, "rewards/margins": 0.7611164450645447, "rewards/rejected": -5.07651424407959, "sft_loss": 4.1059393882751465, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 0.49656743919960683, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.3963681161403656, "logits/rejected": -0.16303816437721252, "logps/chosen": -4.286141395568848, "logps/rejected": -5.307049751281738, "loss": 0.0503, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.286141395568848, "rewards/margins": 1.0209077596664429, "rewards/rejected": -5.307049751281738, "sft_loss": 3.983715772628784, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 0.44754978271656065, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.4166085124015808, "logits/rejected": -0.284808874130249, "logps/chosen": -4.283874034881592, "logps/rejected": -5.237959861755371, "loss": 0.05, "rewards/accuracies": 0.78125, "rewards/chosen": -4.283874034881592, "rewards/margins": 0.954084575176239, "rewards/rejected": -5.237959861755371, "sft_loss": 4.053045749664307, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 0.5852176436351291, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.4020010828971863, "logits/rejected": -0.2330581694841385, "logps/chosen": -4.204090118408203, "logps/rejected": -5.062168598175049, "loss": 0.0493, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.204090118408203, "rewards/margins": 0.8580780029296875, "rewards/rejected": -5.062168598175049, "sft_loss": 3.945356845855713, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 0.5227330600678359, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.42851123213768005, "logits/rejected": -0.21556782722473145, "logps/chosen": -4.467846870422363, "logps/rejected": -5.213390350341797, "loss": 0.0505, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.467846870422363, "rewards/margins": 0.745543897151947, "rewards/rejected": -5.213390350341797, "sft_loss": 4.137904644012451, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 0.7779699882330686, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.3555363416671753, "logits/rejected": -0.2877839207649231, "logps/chosen": -4.3804931640625, "logps/rejected": -5.2144551277160645, "loss": 0.0493, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.3804931640625, "rewards/margins": 0.833962082862854, "rewards/rejected": -5.2144551277160645, "sft_loss": 4.057433128356934, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 0.5454910221823269, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.33761829137802124, "logits/rejected": -0.22594241797924042, "logps/chosen": -4.206428527832031, "logps/rejected": -5.088019847869873, "loss": 0.0494, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.206428527832031, "rewards/margins": 0.8815921545028687, "rewards/rejected": -5.088019847869873, "sft_loss": 3.8602423667907715, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 0.6438981638318502, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.5344552993774414, "logits/rejected": -0.2571745216846466, "logps/chosen": -4.3302435874938965, "logps/rejected": -5.282116889953613, "loss": 0.05, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.3302435874938965, "rewards/margins": 0.9518739581108093, "rewards/rejected": -5.282116889953613, "sft_loss": 4.086483001708984, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 0.5958659951379691, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.29295647144317627, "logits/rejected": -0.08710186183452606, "logps/chosen": -4.312572002410889, "logps/rejected": -5.109742164611816, "loss": 0.0509, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.312572002410889, "rewards/margins": 0.7971704006195068, "rewards/rejected": -5.109742164611816, "sft_loss": 4.068450927734375, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 0.4619365533143964, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.33097267150878906, "logits/rejected": -0.307807981967926, "logps/chosen": -4.2144365310668945, "logps/rejected": -5.045806884765625, "loss": 0.0494, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.2144365310668945, "rewards/margins": 0.8313705325126648, "rewards/rejected": -5.045806884765625, "sft_loss": 3.9541831016540527, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 0.5208760764017685, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.291950523853302, "logits/rejected": -0.10099242627620697, "logps/chosen": -4.414985179901123, "logps/rejected": -5.283970832824707, "loss": 0.0494, "rewards/accuracies": 0.75, "rewards/chosen": -4.414985179901123, "rewards/margins": 0.8689855337142944, "rewards/rejected": -5.283970832824707, "sft_loss": 4.126601696014404, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 0.6338941827024959, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.2738792300224304, "logits/rejected": -0.18372344970703125, "logps/chosen": -4.438006401062012, "logps/rejected": -5.203482627868652, "loss": 0.0505, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.438006401062012, "rewards/margins": 0.7654756903648376, "rewards/rejected": -5.203482627868652, "sft_loss": 4.058730602264404, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 0.5443912681849259, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.23748035728931427, "logits/rejected": -0.3234420716762543, "logps/chosen": -4.397795677185059, "logps/rejected": -4.979608058929443, "loss": 0.0515, "rewards/accuracies": 0.625, "rewards/chosen": -4.397795677185059, "rewards/margins": 0.5818119049072266, "rewards/rejected": -4.979608058929443, "sft_loss": 4.0953369140625, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 0.535553105450855, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.3057805299758911, "logits/rejected": -0.2653143107891083, "logps/chosen": -4.209263324737549, "logps/rejected": -4.970522880554199, "loss": 0.0512, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.209263324737549, "rewards/margins": 0.7612598538398743, "rewards/rejected": -4.970522880554199, "sft_loss": 3.9734046459198, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 1.4483798734565578, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.512469470500946, "logits/rejected": -0.31466788053512573, "logps/chosen": -4.2295098304748535, "logps/rejected": -5.069262981414795, "loss": 0.0502, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.2295098304748535, "rewards/margins": 0.8397535085678101, "rewards/rejected": -5.069262981414795, "sft_loss": 3.985532283782959, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 0.9910451717757924, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.28064030408859253, "logits/rejected": -0.30460792779922485, "logps/chosen": -4.310183525085449, "logps/rejected": -5.07125186920166, "loss": 0.0514, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.310183525085449, "rewards/margins": 0.7610687017440796, "rewards/rejected": -5.07125186920166, "sft_loss": 4.0810065269470215, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 0.5874113570648387, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.38583841919898987, "logits/rejected": -0.3223554491996765, "logps/chosen": -4.332071781158447, "logps/rejected": -5.063580513000488, "loss": 0.0525, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.332071781158447, "rewards/margins": 0.7315087914466858, "rewards/rejected": -5.063580513000488, "sft_loss": 4.141894817352295, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 0.9590204351063107, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.41534432768821716, "logits/rejected": -0.1952972561120987, "logps/chosen": -4.223122596740723, "logps/rejected": -5.143168926239014, "loss": 0.0497, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.223122596740723, "rewards/margins": 0.9200462102890015, "rewards/rejected": -5.143168926239014, "sft_loss": 3.9259402751922607, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 0.5117516358708537, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.3982107639312744, "logits/rejected": -0.241659015417099, "logps/chosen": -4.251479625701904, "logps/rejected": -5.179772853851318, "loss": 0.0488, "rewards/accuracies": 0.75, "rewards/chosen": -4.251479625701904, "rewards/margins": 0.9282932281494141, "rewards/rejected": -5.179772853851318, "sft_loss": 3.968653440475464, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 0.8535394718224574, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.23298446834087372, "logits/rejected": -0.21911530196666718, "logps/chosen": -4.3338518142700195, "logps/rejected": -4.973204612731934, "loss": 0.051, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.3338518142700195, "rewards/margins": 0.6393526196479797, "rewards/rejected": -4.973204612731934, "sft_loss": 3.9991040229797363, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 0.6187135581988519, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.32231375575065613, "logits/rejected": -0.2524074912071228, "logps/chosen": -4.291468620300293, "logps/rejected": -5.2907395362854, "loss": 0.0501, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.291468620300293, "rewards/margins": 0.9992705583572388, "rewards/rejected": -5.2907395362854, "sft_loss": 4.086352825164795, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 0.5124358191216212, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.43386736512184143, "logits/rejected": -0.15008467435836792, "logps/chosen": -4.215126991271973, "logps/rejected": -5.108205318450928, "loss": 0.0494, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.215126991271973, "rewards/margins": 0.8930784463882446, "rewards/rejected": -5.108205318450928, "sft_loss": 3.9824631214141846, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 0.540372134185639, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.26383471488952637, "logits/rejected": -0.19916556775569916, "logps/chosen": -4.226814270019531, "logps/rejected": -4.9440226554870605, "loss": 0.0515, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.226814270019531, "rewards/margins": 0.7172079086303711, "rewards/rejected": -4.9440226554870605, "sft_loss": 3.9626152515411377, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 0.6233774203905628, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.382610023021698, "logits/rejected": -0.19507412612438202, "logps/chosen": -4.561144828796387, "logps/rejected": -5.150514125823975, "loss": 0.0525, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.561144828796387, "rewards/margins": 0.5893692970275879, "rewards/rejected": -5.150514125823975, "sft_loss": 4.331704139709473, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 0.4977719197784728, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.5408740639686584, "logits/rejected": -0.2679939866065979, "logps/chosen": -4.267232418060303, "logps/rejected": -5.070743083953857, "loss": 0.0504, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.267232418060303, "rewards/margins": 0.8035109639167786, "rewards/rejected": -5.070743083953857, "sft_loss": 4.043534278869629, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 0.381660827593946, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.4261409342288971, "logits/rejected": -0.1743827611207962, "logps/chosen": -4.142676830291748, "logps/rejected": -5.355725288391113, "loss": 0.0469, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.142676830291748, "rewards/margins": 1.2130485773086548, "rewards/rejected": -5.355725288391113, "sft_loss": 3.7679145336151123, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 0.43441229618811533, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.3080061376094818, "logits/rejected": -0.2564330995082855, "logps/chosen": -4.471229076385498, "logps/rejected": -5.220850944519043, "loss": 0.0516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.471229076385498, "rewards/margins": 0.7496218681335449, "rewards/rejected": -5.220850944519043, "sft_loss": 4.176270484924316, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 0.5246889640905302, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.38305962085723877, "logits/rejected": -0.2757953405380249, "logps/chosen": -4.224427700042725, "logps/rejected": -5.185813903808594, "loss": 0.0488, "rewards/accuracies": 0.71875, "rewards/chosen": -4.224427700042725, "rewards/margins": 0.9613859057426453, "rewards/rejected": -5.185813903808594, "sft_loss": 3.924294948577881, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 0.43745005810743176, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.4080559313297272, "logits/rejected": -0.18248072266578674, "logps/chosen": -4.294047832489014, "logps/rejected": -5.07814884185791, "loss": 0.0516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.294047832489014, "rewards/margins": 0.7841013669967651, "rewards/rejected": -5.07814884185791, "sft_loss": 4.130050182342529, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 0.5080954271922328, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.3728869557380676, "logits/rejected": -0.29249298572540283, "logps/chosen": -4.324324607849121, "logps/rejected": -4.972744941711426, "loss": 0.0529, "rewards/accuracies": 0.65625, "rewards/chosen": -4.324324607849121, "rewards/margins": 0.6484203338623047, "rewards/rejected": -4.972744941711426, "sft_loss": 4.04088020324707, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 0.5684492771021781, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.39560359716415405, "logits/rejected": -0.22769984602928162, "logps/chosen": -4.299488544464111, "logps/rejected": -5.184499740600586, "loss": 0.0488, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.299488544464111, "rewards/margins": 0.8850114941596985, "rewards/rejected": -5.184499740600586, "sft_loss": 3.9442543983459473, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 0.48701394112032037, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.41835254430770874, "logits/rejected": -0.2046811580657959, "logps/chosen": -4.341827392578125, "logps/rejected": -5.209946632385254, "loss": 0.0503, "rewards/accuracies": 0.75, "rewards/chosen": -4.341827392578125, "rewards/margins": 0.8681195378303528, "rewards/rejected": -5.209946632385254, "sft_loss": 4.115506172180176, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 0.49401293076834946, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.4481055736541748, "logits/rejected": -0.37085479497909546, "logps/chosen": -4.246690273284912, "logps/rejected": -5.07697057723999, "loss": 0.0508, "rewards/accuracies": 0.71875, "rewards/chosen": -4.246690273284912, "rewards/margins": 0.8302801847457886, "rewards/rejected": -5.07697057723999, "sft_loss": 4.007492542266846, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 0.5275705290982255, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.4367121160030365, "logits/rejected": -0.2713041305541992, "logps/chosen": -4.409333229064941, "logps/rejected": -5.059312343597412, "loss": 0.0515, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.409333229064941, "rewards/margins": 0.6499795913696289, "rewards/rejected": -5.059312343597412, "sft_loss": 4.199063301086426, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 0.6097523673192077, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.17959146201610565, "logits/rejected": -0.11355265229940414, "logps/chosen": -4.34035062789917, "logps/rejected": -5.241170406341553, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.34035062789917, "rewards/margins": 0.900820255279541, "rewards/rejected": -5.241170406341553, "sft_loss": 4.116297721862793, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 0.8818009140689909, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.17665904760360718, "logits/rejected": -0.10692217200994492, "logps/chosen": -4.236802101135254, "logps/rejected": -5.121260643005371, "loss": 0.0502, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.236802101135254, "rewards/margins": 0.8844582438468933, "rewards/rejected": -5.121260643005371, "sft_loss": 4.044351100921631, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 0.6358504999909266, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.43429645895957947, "logits/rejected": -0.229470893740654, "logps/chosen": -4.325311660766602, "logps/rejected": -4.965158462524414, "loss": 0.0526, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.325311660766602, "rewards/margins": 0.6398465633392334, "rewards/rejected": -4.965158462524414, "sft_loss": 4.134278774261475, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 0.5806505011410361, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.3749215602874756, "logits/rejected": -0.21662676334381104, "logps/chosen": -4.304712295532227, "logps/rejected": -5.107754707336426, "loss": 0.0491, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.304712295532227, "rewards/margins": 0.8030425906181335, "rewards/rejected": -5.107754707336426, "sft_loss": 3.934671401977539, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 0.557229329842499, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.4316619038581848, "logits/rejected": -0.16651205718517303, "logps/chosen": -4.384096145629883, "logps/rejected": -5.088347911834717, "loss": 0.0507, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.384096145629883, "rewards/margins": 0.7042518854141235, "rewards/rejected": -5.088347911834717, "sft_loss": 4.104673862457275, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 0.6567540054055532, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.34968018531799316, "logits/rejected": -0.2774543762207031, "logps/chosen": -4.437806129455566, "logps/rejected": -5.0514817237854, "loss": 0.054, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.437806129455566, "rewards/margins": 0.6136748790740967, "rewards/rejected": -5.0514817237854, "sft_loss": 4.183988094329834, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 0.4106224978496865, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.2766024172306061, "logits/rejected": -0.3795970380306244, "logps/chosen": -4.4523186683654785, "logps/rejected": -5.04266881942749, "loss": 0.0514, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.4523186683654785, "rewards/margins": 0.5903505086898804, "rewards/rejected": -5.04266881942749, "sft_loss": 4.153774738311768, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 0.5261648848812568, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.3736271858215332, "logits/rejected": -0.25079482793807983, "logps/chosen": -4.285164833068848, "logps/rejected": -5.054616451263428, "loss": 0.0504, "rewards/accuracies": 0.6875, "rewards/chosen": -4.285164833068848, "rewards/margins": 0.7694514989852905, "rewards/rejected": -5.054616451263428, "sft_loss": 4.0142927169799805, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 0.5227266083540951, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.3007558286190033, "logits/rejected": -0.07791855931282043, "logps/chosen": -4.21551513671875, "logps/rejected": -5.0369462966918945, "loss": 0.0503, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.21551513671875, "rewards/margins": 0.8214312791824341, "rewards/rejected": -5.0369462966918945, "sft_loss": 3.962756633758545, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 0.5479204943216744, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.37504953145980835, "logits/rejected": -0.34630855917930603, "logps/chosen": -4.44875431060791, "logps/rejected": -5.066702365875244, "loss": 0.0508, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.44875431060791, "rewards/margins": 0.6179476976394653, "rewards/rejected": -5.066702365875244, "sft_loss": 4.060311317443848, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 0.6364064477895179, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.45711517333984375, "logits/rejected": -0.20373289287090302, "logps/chosen": -4.335000991821289, "logps/rejected": -5.186996936798096, "loss": 0.051, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.335000991821289, "rewards/margins": 0.8519953489303589, "rewards/rejected": -5.186996936798096, "sft_loss": 4.1510090827941895, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 0.37950094909926635, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.2739645838737488, "logits/rejected": -0.19639113545417786, "logps/chosen": -4.4395341873168945, "logps/rejected": -5.209003448486328, "loss": 0.0501, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.4395341873168945, "rewards/margins": 0.7694700956344604, "rewards/rejected": -5.209003448486328, "sft_loss": 4.123523235321045, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 0.5038478075619528, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.2963625490665436, "logits/rejected": -0.3096460700035095, "logps/chosen": -4.370425701141357, "logps/rejected": -5.104419231414795, "loss": 0.0509, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.370425701141357, "rewards/margins": 0.7339931130409241, "rewards/rejected": -5.104419231414795, "sft_loss": 4.177587509155273, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 0.7755431035459227, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.3818827271461487, "logits/rejected": -0.21020355820655823, "logps/chosen": -4.361794948577881, "logps/rejected": -5.4317145347595215, "loss": 0.0504, "rewards/accuracies": 0.78125, "rewards/chosen": -4.361794948577881, "rewards/margins": 1.0699187517166138, "rewards/rejected": -5.4317145347595215, "sft_loss": 4.175992965698242, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 0.47191157552693663, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.46422845125198364, "logits/rejected": -0.30519360303878784, "logps/chosen": -4.419317722320557, "logps/rejected": -5.268907070159912, "loss": 0.0495, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.419317722320557, "rewards/margins": 0.849589467048645, "rewards/rejected": -5.268907070159912, "sft_loss": 3.9879608154296875, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 0.6659865489877009, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.35505035519599915, "logits/rejected": -0.3015265166759491, "logps/chosen": -4.270551681518555, "logps/rejected": -5.060952186584473, "loss": 0.0502, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.270551681518555, "rewards/margins": 0.7904006838798523, "rewards/rejected": -5.060952186584473, "sft_loss": 3.9596004486083984, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 0.5271862878415422, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.39922767877578735, "logits/rejected": -0.200235053896904, "logps/chosen": -4.428971290588379, "logps/rejected": -5.323958396911621, "loss": 0.0519, "rewards/accuracies": 0.75, "rewards/chosen": -4.428971290588379, "rewards/margins": 0.8949869871139526, "rewards/rejected": -5.323958396911621, "sft_loss": 4.2644548416137695, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 0.7184273436158761, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.46444278955459595, "logits/rejected": -0.20382392406463623, "logps/chosen": -4.392231464385986, "logps/rejected": -5.154418468475342, "loss": 0.0511, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.392231464385986, "rewards/margins": 0.7621868848800659, "rewards/rejected": -5.154418468475342, "sft_loss": 4.077957630157471, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 0.7951708455272362, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.3619312644004822, "logits/rejected": -0.32184141874313354, "logps/chosen": -4.419110298156738, "logps/rejected": -5.143651008605957, "loss": 0.0528, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.419110298156738, "rewards/margins": 0.7245412468910217, "rewards/rejected": -5.143651008605957, "sft_loss": 4.224579334259033, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 0.5457381779871645, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.47487345337867737, "logits/rejected": -0.3351452052593231, "logps/chosen": -4.251893043518066, "logps/rejected": -5.026724338531494, "loss": 0.0503, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.251893043518066, "rewards/margins": 0.7748310565948486, "rewards/rejected": -5.026724338531494, "sft_loss": 4.000130653381348, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 0.5333018714647327, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.3330259919166565, "logits/rejected": -0.22684387862682343, "logps/chosen": -4.1902852058410645, "logps/rejected": -5.100726127624512, "loss": 0.0498, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.1902852058410645, "rewards/margins": 0.9104412794113159, "rewards/rejected": -5.100726127624512, "sft_loss": 3.9527220726013184, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 0.5068612902949374, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.23308193683624268, "logits/rejected": -0.19373756647109985, "logps/chosen": -4.388981342315674, "logps/rejected": -5.182524681091309, "loss": 0.0505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.388981342315674, "rewards/margins": 0.7935434579849243, "rewards/rejected": -5.182524681091309, "sft_loss": 4.066475868225098, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 0.5814757487353343, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.35087448358535767, "logits/rejected": -0.12827900052070618, "logps/chosen": -4.3805389404296875, "logps/rejected": -5.340671539306641, "loss": 0.0493, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.3805389404296875, "rewards/margins": 0.9601324796676636, "rewards/rejected": -5.340671539306641, "sft_loss": 4.0317511558532715, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 0.550436838494477, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.2993507981300354, "logits/rejected": -0.29762619733810425, "logps/chosen": -4.355318546295166, "logps/rejected": -5.036417007446289, "loss": 0.0509, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.355318546295166, "rewards/margins": 0.6810978651046753, "rewards/rejected": -5.036417007446289, "sft_loss": 4.080462455749512, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 0.5420103145317201, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.4515206217765808, "logits/rejected": -0.34055566787719727, "logps/chosen": -4.29352331161499, "logps/rejected": -5.112155437469482, "loss": 0.0513, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.29352331161499, "rewards/margins": 0.818631649017334, "rewards/rejected": -5.112155437469482, "sft_loss": 4.067957878112793, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 0.5884906325479862, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.34654372930526733, "logits/rejected": -0.20942942798137665, "logps/chosen": -4.317554950714111, "logps/rejected": -5.352757453918457, "loss": 0.0479, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.317554950714111, "rewards/margins": 1.0352026224136353, "rewards/rejected": -5.352757453918457, "sft_loss": 3.913717269897461, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 0.8380934579646794, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.3536500930786133, "logits/rejected": -0.17102427780628204, "logps/chosen": -4.269620418548584, "logps/rejected": -5.160853385925293, "loss": 0.0498, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.269620418548584, "rewards/margins": 0.8912326693534851, "rewards/rejected": -5.160853385925293, "sft_loss": 3.962118148803711, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 0.5255025356739468, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.5044499635696411, "logits/rejected": -0.19058753550052643, "logps/chosen": -4.287683963775635, "logps/rejected": -5.2070136070251465, "loss": 0.0494, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.287683963775635, "rewards/margins": 0.9193302392959595, "rewards/rejected": -5.2070136070251465, "sft_loss": 3.977626323699951, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 0.5414871942816999, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.46721917390823364, "logits/rejected": -0.25959575176239014, "logps/chosen": -4.274872779846191, "logps/rejected": -5.1689348220825195, "loss": 0.0508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.274872779846191, "rewards/margins": 0.8940622210502625, "rewards/rejected": -5.1689348220825195, "sft_loss": 4.059477806091309, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 0.40066954094597396, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.3345833122730255, "logits/rejected": -0.25943347811698914, "logps/chosen": -4.5316338539123535, "logps/rejected": -5.23195219039917, "loss": 0.052, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.5316338539123535, "rewards/margins": 0.7003186345100403, "rewards/rejected": -5.23195219039917, "sft_loss": 4.1835479736328125, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 0.6992304630126898, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.36338135600090027, "logits/rejected": -0.14956054091453552, "logps/chosen": -4.3805952072143555, "logps/rejected": -5.219130039215088, "loss": 0.0502, "rewards/accuracies": 0.71875, "rewards/chosen": -4.3805952072143555, "rewards/margins": 0.838534951210022, "rewards/rejected": -5.219130039215088, "sft_loss": 4.159787178039551, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 0.5406575761049948, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.3695183992385864, "logits/rejected": -0.23002979159355164, "logps/chosen": -4.364432334899902, "logps/rejected": -5.423884868621826, "loss": 0.0496, "rewards/accuracies": 0.71875, "rewards/chosen": -4.364432334899902, "rewards/margins": 1.0594522953033447, "rewards/rejected": -5.423884868621826, "sft_loss": 4.099183082580566, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.05091719701886177, "eval_logits/rejected": 0.14437253773212433, "eval_logps/chosen": -4.325172424316406, "eval_logps/rejected": -5.104435920715332, "eval_loss": 0.04997369274497032, "eval_rewards/accuracies": 0.68916916847229, "eval_rewards/chosen": -4.325172424316406, "eval_rewards/margins": 0.7792637348175049, "eval_rewards/rejected": -5.104435920715332, "eval_runtime": 45.1351, "eval_samples_per_second": 29.799, "eval_sft_loss": 3.922001361846924, "eval_steps_per_second": 7.466, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.05868237358195538, "train_runtime": 34511.7277, "train_samples_per_second": 5.197, "train_steps_per_second": 0.162 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }